hai 13 anos · cc52f5a009
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -1034,6 +1034,10 @@ Check if the worker specified by workerid can execute the codelet. Schedulers ne
 
				 Return the current date in µs
			
 
				 @end deftypefun
			
 
				 
			
 
				+@deftypefun uint32_t starpu_task_footprint ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *} @var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
			
 
				+Returns the footprint for a given task
			
 
				+@end deftypefun
			
 
				+
			
 
				 @deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
			
 
				 Returns expected task duration in µs
			
 
				 @end deftypefun
			
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -234,7 +234,7 @@ starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
 
				 /* Partition the vector in PARTS sub-vectors */
			
 
				 starpu_data_filter f =
			
 
				 @{
			
 
				-    .filter_func = starpu_block_filter_func_vector,
			
 
				+    .filter_func = starpu_vector_filter_block,
			
 
				     .nchildren = PARTS
			
 
				 @};
			
 
				 starpu_data_partition(handle, &f);
			
@@ -430,11 +430,14 @@ a name which is different from the execution time performance model.
 
				 
			
 
				 The application can request time estimations from the StarPU performance
			
 
				 models by filling a task structure as usual without actually submitting
			
 
				-it. The data handles can be created by calling @code{starpu_data_register}
			
 
				-functions with a @code{NULL} pointer (and need to be unregistered as usual)
			
 
				-and the desired data sizes. The @code{starpu_task_expected_length} and
			
 
				-@code{starpu_task_expected_power} functions can then be called to get an
			
 
				-estimation of the task duration on a given arch. @code{starpu_task_destroy}
			
 
				+it. The data handles can be created by calling @code{starpu_*_data_register}
			
 
				+functions with a @code{NULL} pointer and @code{-1} node and the
			
 
				+desired data sizes, and need to be unregistered as usual. The
			
 
				+@code{starpu_task_expected_length} and @code{starpu_task_expected_power}
			
 
				+functions can then be called to get an estimation of the task cost on a given
			
 
				+arch. @code{starpu_task_footprint} can also be used to get the footprint used
			
 
				+for indexing history-based performance models.
			
 
				+@code{starpu_task_destroy}
			
 
				 needs to be called to destroy the dummy task afterwards. See
			
 
				 @code{tests/perfmodels/regression_based.c} for an example.
			
 
				 
			
--- a/doc/chapters/basic-api.texi
+++ b/doc/chapters/basic-api.texi
@@ -1221,12 +1221,12 @@ Return the size of the elements registered into the matrix designated by
 
				 Applications can provide their own interface. An example is provided in
			
 
				 @code{examples/interface}. A few helpers are provided.
			
 
				 
			
 
				-@deftypefun uintptr_t starpu_allocate_buffer_on_node (unsigned @var{dst_node}, size_t @var{size})
			
 
				+@deftypefun uintptr_t starpu_malloc_on_node (unsigned @var{dst_node}, size_t @var{size})
			
 
				 Allocate @var{size} bytes on node @var{dst_node}. This returns 0 if allocation
			
 
				 failed, the allocation method should then return -ENOMEM as allocated size.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_free_buffer_on_node (unsigned @var{dst_node}, uintptr_t @var{addr}, size_t @var{size})
			
 
				+@deftypefun void starpu_free_on_node (unsigned @var{dst_node}, uintptr_t @var{addr}, size_t @var{size})
			
 
				 Free @var{addr} of @var{size} bytes on node @var{dst_node}.
			
 
				 @end deftypefun
			
 
				 
			
@@ -1280,7 +1280,7 @@ subdata according to the filter @var{f}, as shown in the following example:
 
				 @cartouche
			
 
				 @smallexample
			
 
				 struct starpu_data_filter f = @{
			
 
				-    .filter_func = starpu_block_filter_func,
			
 
				+    .filter_func = starpu_matrix_filter_block,
			
 
				     .nchildren = nslicesx,
			
 
				     .get_nchildren = NULL,
			
 
				     .get_child_ops = NULL
			
@@ -1359,13 +1359,13 @@ list can be found in @code{starpu_data_filters.h} .
 
				 @node Partitioning Vector Data
			
 
				 @subsubsection Partitioning Vector Data
			
 
				 
			
 
				-@deftypefun void starpu_block_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_vector_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 Return in @code{*@var{child_interface}} the @var{id}th element of the
			
 
				 vector represented by @var{father_interface} once partitioned in
			
 
				 @var{nparts} chunks of equal size.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_vector_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 Return in @code{*@var{child_interface}} the @var{id}th element of the
			
 
				 vector represented by @var{father_interface} once partitioned in
			
 
				 @var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow
			
@@ -1378,7 +1378,7 @@ enforced for the shadowed parts.
 
				 A usage example is available in examples/filters/shadow.c
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_vector_filter_list (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 Return in @code{*@var{child_interface}} the @var{id}th element of the
			
 
				 vector represented by @var{father_interface} once partitioned into
			
 
				 @var{nparts} chunks according to the @code{filter_arg_ptr} field of
			
@@ -1389,7 +1389,7 @@ The @code{filter_arg_ptr} field must point to an array of @var{nparts}
 
				 in each chunk of the partition.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_vector_filter_divide_in_2 (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 Return in @code{*@var{child_interface}} the @var{id}th element of the
			
 
				 vector represented by @var{father_interface} once partitioned in two
			
 
				 chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
			
@@ -1400,13 +1400,13 @@ chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
 
				 @node Partitioning Matrix Data
			
 
				 @subsubsection Partitioning Matrix Data
			
 
				 
			
 
				-@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_matrix_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a dense Matrix along the x dimension, thus getting (x/nparts,y)
			
 
				 matrices. If nparts does not divide x, the last submatrix contains the
			
 
				 remainder.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_matrix_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a dense Matrix along the x dimension, with a shadow border
			
 
				 @code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y)
			
 
				 matrices. If nparts does not divide x-2*shadow, the last submatrix contains the
			
@@ -1418,13 +1418,13 @@ enforced for the shadowed parts.
 
				 A usage example is available in examples/filters/shadow2d.c
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_matrix_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a dense Matrix along the y dimension, thus getting (x,y/nparts)
			
 
				 matrices. If nparts does not divide y, the last submatrix contains the
			
 
				 remainder.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vertical_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_matrix_filter_vertical_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a dense Matrix along the y dimension, with a shadow border
			
 
				 @code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow)
			
 
				 matrices. If nparts does not divide y-2*shadow, the last submatrix contains the
			
@@ -1441,13 +1441,13 @@ A usage example is available in examples/filters/shadow2d.c
 
				 
			
 
				 A usage example is available in examples/filters/shadow3d.c
			
 
				 
			
 
				-@deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the X dimension, thus getting (x/nparts,y,z)
			
 
				 3D matrices. If nparts does not divide x, the last submatrix contains the
			
 
				 remainder.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the X dimension, with a shadow border
			
 
				 @code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y,z) 3D
			
 
				 matrices. If nparts does not divide x, the last submatrix contains the
			
@@ -1457,13 +1457,13 @@ IMPORTANT: This can only be used for read-only access, as no coherency is
 
				 enforced for the shadowed parts.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vertical_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the Y dimension, thus getting (x,y/nparts,z)
			
 
				 3D matrices. If nparts does not divide y, the last submatrix contains the
			
 
				 remainder.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vertical_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_vertical_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the Y dimension, with a shadow border
			
 
				 @code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow,z) 3D
			
 
				 matrices. If nparts does not divide y, the last submatrix contains the
			
@@ -1473,13 +1473,13 @@ IMPORTANT: This can only be used for read-only access, as no coherency is
 
				 enforced for the shadowed parts.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_depth_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_depth_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the Z dimension, thus getting (x,y,z/nparts)
			
 
				 3D matrices. If nparts does not divide z, the last submatrix contains the
			
 
				 remainder.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_depth_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_block_filter_depth_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a 3D matrix along the Z dimension, with a shadow border
			
 
				 @code{filter_arg_ptr}, thus getting (x,y,(z-2*shadow)/nparts+2*shadow)
			
 
				 3D matrices. If nparts does not divide z, the last submatrix contains the
			
@@ -1492,11 +1492,11 @@ enforced for the shadowed parts.
 
				 @node Partitioning BCSR Data
			
 
				 @subsubsection Partitioning BCSR Data
			
 
				 
			
 
				-@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_bcsr_filter_canonical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a block-sparse matrix into dense matrices.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				+@deftypefun void starpu_csr_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
			
 
				 This partitions a block-sparse matrix into vertical block-sparse matrices.
			
 
				 @end deftypefun
			
 
				 
			
@@ -2533,15 +2533,15 @@ whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
 
				 the @code{starpu_conf} structure.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_helper_cublas_init (void)
			
 
				+@deftypefun void starpu_cublas_init (void)
			
 
				 This function initializes CUBLAS on every CUDA device.
			
 
				 The CUBLAS library must be initialized prior to any CUBLAS call. Calling
			
 
				-@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
			
 
				+@code{starpu_cublas_init} will initialize CUBLAS on every CUDA device
			
 
				 controlled by StarPU. This call blocks until CUBLAS has been properly
			
 
				 initialized on every device.
			
 
				 @end deftypefun
			
 
				 
			
 
				-@deftypefun void starpu_helper_cublas_shutdown (void)
			
 
				+@deftypefun void starpu_cublas_shutdown (void)
			
 
				 This function synchronously deinitializes the CUBLAS library on every CUDA device.
			
 
				 @end deftypefun
			
 
				 
			
--- a/doc/chapters/tips-tricks.texi
+++ b/doc/chapters/tips-tricks.texi
@@ -61,7 +61,7 @@ static void fft(void *descr[], void *_args)
 
				 Another way to go which may be needed is to execute some code from the workers
			
 
				 themselves thanks to @code{starpu_execute_on_each_worker}. This may be required
			
 
				 by CUDA to behave properly due to threading issues. For instance, StarPU's
			
 
				-@code{starpu_helper_cublas_init} looks like the following to call
			
 
				+@code{starpu_cublas_init} looks like the following to call
			
 
				 @code{cublasInit} from the workers themselves:
			
 
				 
			
 
				 @cartouche
			
@@ -71,7 +71,7 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 
				     cublasStatus cublasst = cublasInit();
			
 
				     cublasSetKernelStream(starpu_cuda_get_local_stream());
			
 
				 @}
			
 
				-void starpu_helper_cublas_init(void)
			
 
				+void starpu_cublas_init(void)
			
 
				 @{
			
 
				     starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
			
 
				 @}
			
--- a/doc/tutorial/hello_world.c
+++ b/doc/tutorial/hello_world.c
@@ -32,7 +32,6 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 struct starpu_codelet cl =
			
 
				 {
			
 
				-    .where = STARPU_CPU,
			
 
				     .cpu_funcs = {cpu_func, NULL},
			
 
				     .nbuffers = 0
			
 
				 };
			
--- a/doc/tutorial/vector_scal.c
+++ b/doc/tutorial/vector_scal.c
@@ -31,7 +31,6 @@ extern void scal_cuda_func(void *buffers[], void *_args);
 
				 extern void scal_opencl_func(void *buffers[], void *_args);
			
 
				 
			
 
				 static struct starpu_codelet cl = {
			
 
				-    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
			
 
				     /* CPU implementation of the codelet */
			
 
				     .cpu_funcs = {scal_cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/examples/audio/starpu_audio_processing.c
+++ b/examples/audio/starpu_audio_processing.c
@@ -283,7 +283,6 @@ struct starpu_perfmodel band_filter_model =
 
				 static struct starpu_codelet band_filter_cl =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {band_filter_kernel_gpu, NULL},
			
 
				 #endif
			
@@ -413,13 +412,13 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
			
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				 		.nchildren = niter
			
 
				 	};
			
 
				 
			
@@ -463,7 +462,7 @@ int main(int argc, char **argv)
 
				 	starpu_data_unpartition(A_handle, 0);
			
 
				 	starpu_data_unregister(A_handle);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 
			
 
				 	/* we are done ! */
			
 
				 	starpu_shutdown();
			
--- a/examples/axpy/axpy.c
+++ b/examples/axpy/axpy.c
@@ -128,7 +128,7 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 #endif
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	/* This is equivalent to
			
 
				 		vec_a = malloc(N*sizeof(TYPE));
			
@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 
				 	/* Divide the vector into blocks */
			
 
				 	struct starpu_data_filter block_filter =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				 		.nchildren = NBLOCKS
			
 
				 	};
			
 
				 
			
--- a/examples/basic_examples/hello_world.c
+++ b/examples/basic_examples/hello_world.c
@@ -82,7 +82,6 @@ int main(int argc, char **argv)
 
				 
			
 
				 	/* this codelet may only be executed on a CPU, and its cpu
			
 
				  	 * implementation is function "cpu_func" */
			
 
				-	cl.where = STARPU_CPU;
			
 
				 	cl.cpu_funcs[0] = cpu_func;
			
 
				 	/* the codelet does not manipulate any data that is managed
			
 
				 	 * by our DSM */
			
--- a/examples/basic_examples/mult.c
+++ b/examples/basic_examples/mult.c
@@ -194,13 +194,13 @@ static void partition_mult_data(void)
 
				 	 * name of the filters are a bit misleading */
			
 
				 	struct starpu_data_filter vert =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nslicesx
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter horiz =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nslicesy
			
 
				 	};
			
 
				 
			
@@ -263,7 +263,6 @@ static struct starpu_perfmodel mult_perf_model =
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				         /* we can only execute that kernel on a CPU yet */
			
 
				-        .where = STARPU_CPU,
			
 
				         /* CPU implementation of the codelet */
			
 
				         .cpu_funcs = {cpu_mult, NULL},
			
 
				         /* the codelet manipulates 3 buffers that are managed by the
			
--- a/examples/basic_examples/multiformat.c
+++ b/examples/basic_examples/multiformat.c
@@ -79,7 +79,6 @@ extern void multiformat_scal_opencl_func(void *buffers[], void *arg);
 
				 #ifdef STARPU_USE_CPU
			
 
				 static struct starpu_codelet cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {multiformat_scal_cpu_func, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
@@ -90,7 +89,6 @@ static struct starpu_codelet cpu_cl =
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static struct starpu_codelet cuda_cl =
			
 
				 {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = { multiformat_scal_cuda_func, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
@@ -101,7 +99,6 @@ static struct starpu_codelet cuda_cl =
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 static struct starpu_codelet opencl_cl =
			
 
				 {
			
 
				-	.where = STARPU_OPENCL,
			
 
				 	.opencl_funcs = { multiformat_scal_opencl_func, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
--- a/examples/basic_examples/multiformat_conversion_codelets.c
+++ b/examples/basic_examples/multiformat_conversion_codelets.c
@@ -34,7 +34,6 @@ void cuda_to_cpu(void *buffers[], void *arg)
 
				 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
			
 
				 struct starpu_codelet cpu_to_cuda_cl =
			
 
				 {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.name = "codelet_cpu_to_cuda"
			
@@ -42,7 +41,6 @@ struct starpu_codelet cpu_to_cuda_cl =
 
				 
			
 
				 struct starpu_codelet cuda_to_cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {cuda_to_cpu, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.name = "codelet_cude_to_cpu"
			
@@ -67,14 +65,12 @@ void opencl_to_cpu(void *buffers[], void *arg)
 
				 extern void cpu_to_opencl_opencl_func(void *buffers[], void *args);
			
 
				 struct starpu_codelet cpu_to_opencl_cl =
			
 
				 {
			
 
				-	.where = STARPU_OPENCL,
			
 
				 	.opencl_funcs = {cpu_to_opencl_opencl_func, NULL},
			
 
				 	.nbuffers = 1
			
 
				 };
			
 
				 
			
 
				 struct starpu_codelet opencl_to_cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {opencl_to_cpu, NULL},
			
 
				 	.nbuffers = 1
			
 
				 };
			
--- a/examples/basic_examples/variable.c
+++ b/examples/basic_examples/variable.c
@@ -59,7 +59,6 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 #endif
			
 
				 
			
 
				-	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
			
 
				         cl.cpu_funcs[0] = cpu_codelet;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				         cl.cuda_funcs[0] = cuda_codelet;
			
--- a/examples/basic_examples/vector_scal_c.c
+++ b/examples/basic_examples/vector_scal_c.c
@@ -41,7 +41,6 @@ static struct starpu_perfmodel vector_scal_model =
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU | STARPU_CUDA,
			
 
				 	/* CPU implementation of the codelet */
			
 
				 	.cpu_funcs = {scal_cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/examples/callback/callback.c
+++ b/examples/callback/callback.c
@@ -33,7 +33,6 @@ void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
				 struct starpu_codelet cl =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {cpu_codelet, NULL},
			
 
				 	.nbuffers = 1
			
 
				 };
			
--- a/examples/cg/cg.c
+++ b/examples/cg/cg.c
@@ -197,10 +197,10 @@ static void partition_data(void)
 
				 	 */
			
 
				 
			
 
				 	/* Partition into contiguous parts */
			
 
				-	matrix_filter_1.filter_func = starpu_block_filter_func;
			
 
				+	matrix_filter_1.filter_func = starpu_matrix_filter_block;
			
 
				 	matrix_filter_1.nchildren = nblocks;
			
 
				 	/* Partition into non-contiguous parts */
			
 
				-	matrix_filter_2.filter_func = starpu_vertical_block_filter_func;
			
 
				+	matrix_filter_2.filter_func = starpu_matrix_filter_vertical_block;
			
 
				 	matrix_filter_2.nchildren = nblocks;
			
 
				 
			
 
				 	/* A is in FORTRAN ordering, starpu_data_get_sub_data(A_handle, 2, i,
			
@@ -211,7 +211,7 @@ static void partition_data(void)
 
				 	 *	Partition the vectors
			
 
				 	 */
			
 
				 
			
 
				-	vector_filter.filter_func = starpu_block_filter_func_vector;
			
 
				+	vector_filter.filter_func = starpu_vector_filter_block;
			
 
				 	vector_filter.nchildren = nblocks;
			
 
				 
			
 
				 	starpu_data_partition(b_handle, &vector_filter);
			
@@ -417,7 +417,7 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	generate_random_problem();
			
 
				 	register_data();
			
@@ -431,7 +431,7 @@ int main(int argc, char **argv)
 
				 	starpu_task_wait_for_all();
			
 
				 	unregister_data();
			
 
				 	free_data();
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return ret;
			
--- a/examples/cg/cg_kernels.c
+++ b/examples/cg/cg_kernels.c
@@ -94,7 +94,6 @@ static struct starpu_perfmodel accumulate_variable_model =
 
				 struct starpu_codelet accumulate_variable_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {accumulate_variable_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {accumulate_variable_cuda, NULL},
			
@@ -133,7 +132,6 @@ static struct starpu_perfmodel accumulate_vector_model =
 
				 struct starpu_codelet accumulate_vector_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {accumulate_vector_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {accumulate_vector_cuda, NULL},
			
@@ -174,7 +172,6 @@ static struct starpu_perfmodel bzero_variable_model =
 
				 struct starpu_codelet bzero_variable_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {bzero_variable_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {bzero_variable_cuda, NULL},
			
@@ -212,7 +209,6 @@ static struct starpu_perfmodel bzero_vector_model =
 
				 struct starpu_codelet bzero_vector_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {bzero_vector_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {bzero_vector_cuda, NULL},
			
@@ -268,7 +264,6 @@ static struct starpu_perfmodel dot_kernel_model =
 
				 static struct starpu_codelet dot_kernel_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dot_kernel_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dot_kernel_cuda, NULL},
			
@@ -348,7 +343,6 @@ static struct starpu_perfmodel scal_kernel_model =
 
				 static struct starpu_codelet scal_kernel_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {scal_kernel_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {scal_kernel_cuda, NULL},
			
@@ -422,7 +416,6 @@ static struct starpu_perfmodel gemv_kernel_model =
 
				 static struct starpu_codelet gemv_kernel_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {gemv_kernel_cpu, NULL},
			
@@ -522,7 +515,6 @@ static struct starpu_perfmodel scal_axpy_kernel_model =
 
				 static struct starpu_codelet scal_axpy_kernel_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
			
@@ -597,7 +589,6 @@ static struct starpu_perfmodel axpy_kernel_model =
 
				 static struct starpu_codelet axpy_kernel_cl =
			
 
				 {
			
 
				 	.can_execute = can_execute,
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {axpy_kernel_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {axpy_kernel_cuda, NULL},
			
--- a/examples/cholesky/cholesky_grain_tag.c
+++ b/examples/cholesky/cholesky_grain_tag.c
@@ -39,7 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
@@ -77,7 +76,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
@@ -124,7 +122,6 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
@@ -198,13 +195,13 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
@@ -295,7 +292,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (pinned)
			
@@ -341,7 +338,7 @@ static void shutdown_system(float **matA, unsigned pinned)
 
				 	     free(*matA);
			
 
				 	}
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 }
			
 
				 
			
--- a/examples/cholesky/cholesky_implicit.c
+++ b/examples/cholesky/cholesky_implicit.c
@@ -24,7 +24,6 @@
 
				 
			
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -39,7 +38,6 @@ static struct starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
@@ -54,7 +52,6 @@ static struct starpu_codelet cl21 =
 
				 
			
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
@@ -185,13 +182,13 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
@@ -344,7 +341,7 @@ int main(int argc, char **argv)
 
				                 return 77;
			
 
				         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	if(with_ctxs)
			
 
				 	{
			
@@ -360,7 +357,7 @@ int main(int argc, char **argv)
 
				 	else
			
 
				 		execute_cholesky(size, nblocks);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return ret;
			
--- a/examples/cholesky/cholesky_tag.c
+++ b/examples/cholesky/cholesky_tag.c
@@ -39,7 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
@@ -78,7 +77,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
@@ -127,7 +125,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
@@ -261,7 +258,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (pinned)
			
@@ -288,13 +285,13 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
@@ -316,7 +313,7 @@ static void shutdown_system(float **matA, unsigned pinned)
 
				 		free(*matA);
			
 
				 	}
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 }
			
 
				 
			
--- a/examples/cholesky/cholesky_tile_tag.c
+++ b/examples/cholesky/cholesky_tile_tag.c
@@ -42,7 +42,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
@@ -80,7 +79,6 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
@@ -127,7 +125,6 @@ static int create_task_21(unsigned k, unsigned j)
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
@@ -260,7 +257,7 @@ int main(int argc, char **argv)
 
				 	/* Disable sequential consistency */
			
 
				 	starpu_data_set_default_sequential_consistency_flag(0);
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	for (y = 0; y < nblocks; y++)
			
@@ -321,7 +318,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 	return ret;
			
--- a/examples/filters/custom_mf/custom_conversion_codelets.c
+++ b/examples/filters/custom_mf/custom_conversion_codelets.c
@@ -39,7 +39,6 @@ void cuda_to_cpu(void *buffers[], void *arg)
 
				 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
			
 
				 struct starpu_codelet cpu_to_cuda_cl =
			
 
				 {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
			
 
				 	.modes = { STARPU_RW },
			
 
				 	.nbuffers = 1,
			
@@ -48,7 +47,6 @@ struct starpu_codelet cpu_to_cuda_cl =
 
				 
			
 
				 struct starpu_codelet cuda_to_cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {cuda_to_cpu, NULL},
			
 
				 	.modes = { STARPU_RW },
			
 
				 	.nbuffers = 1,
			
@@ -77,7 +75,6 @@ extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
 
				 
			
 
				 struct starpu_codelet cpu_to_opencl_cl =
			
 
				 {
			
 
				-	.where = STARPU_OPENCL,
			
 
				 	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
			
 
				 	.modes = { STARPU_RW },
			
 
				 	.nbuffers = 1,
			
@@ -86,7 +83,6 @@ struct starpu_codelet cpu_to_opencl_cl =
 
				 
			
 
				 struct starpu_codelet opencl_to_cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = { opencl_to_cpu_cpu_func, NULL },
			
 
				 	.modes = { STARPU_RW },
			
 
				 	.nbuffers = 1,
			
--- a/examples/filters/custom_mf/custom_interface.c
+++ b/examples/filters/custom_mf/custom_interface.c
@@ -150,16 +150,16 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, unsigned nod
 
				 	custom_interface = (struct custom_data_interface *) data_interface;
			
 
				 
			
 
				 	size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
			
 
				-	custom_interface->cpu_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
			
 
				+	custom_interface->cpu_ptr = (void*) starpu_malloc_on_node(node, size);
			
 
				 	if (!custom_interface->cpu_ptr)
			
 
				 		goto fail_cpu;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	custom_interface->cuda_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
			
 
				+	custom_interface->cuda_ptr = (void*) starpu_malloc_on_node(node, size);
			
 
				 	if (!custom_interface->cuda_ptr)
			
 
				 		goto fail_cuda;
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	custom_interface->opencl_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
			
 
				+	custom_interface->opencl_ptr = (void*) starpu_malloc_on_node(node, size);
			
 
				 	if (!custom_interface->opencl_ptr)
			
 
				 		goto fail_opencl;
			
 
				 #endif
			
@@ -175,13 +175,13 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, unsigned nod
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 fail_opencl:
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
			
 
				+	starpu_free_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
			
 
				 #endif
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 fail_cuda:
			
 
				 #endif
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
			
 
				+	starpu_free_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
			
 
				 fail_cpu:
			
 
				 	return -ENOMEM;
			
 
				 }
			
@@ -191,12 +191,12 @@ static void free_custom_buffer_on_node(void *data_interface, unsigned node)
 
				 	struct custom_data_interface *custom_interface = (struct custom_data_interface *) data_interface;
			
 
				 	size_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
			
 
				 
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
			
 
				+	starpu_free_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
			
 
				+	starpu_free_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
			
 
				+	starpu_free_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
			
 
				 #endif
			
 
				 }
			
 
				 
			
--- a/examples/filters/custom_mf/custom_mf_filter.c
+++ b/examples/filters/custom_mf/custom_mf_filter.c
@@ -148,7 +148,6 @@ extern void custom_scal_cuda_func(void *buffers[], void *args);
 
				 
			
 
				 static struct starpu_codelet cpu_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = { custom_scal_cpu_func, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
@@ -158,7 +157,6 @@ static struct starpu_codelet cpu_cl =
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static struct starpu_codelet cuda_cl =
			
 
				 {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = { custom_scal_cuda_func, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
@@ -171,7 +169,6 @@ extern void custom_scal_opencl_func(void *buffers[], void *args);
 
				 
			
 
				 static struct starpu_codelet opencl_cl =
			
 
				 {
			
 
				-	.where = STARPU_OPENCL,
			
 
				 	.opencl_funcs = { custom_scal_opencl_func, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_RW },
			
--- a/examples/filters/fblock.c
+++ b/examples/filters/fblock.c
@@ -91,7 +91,6 @@ int main(int argc, char **argv)
 
				 	starpu_data_handle_t handle;
			
 
				 	struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				                 .cuda_funcs = {cuda_func, NULL},
			
@@ -121,7 +120,7 @@ int main(int argc, char **argv)
 
				         /* Partition the block in PARTS sub-blocks */
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_block,
			
 
				 		.nchildren = PARTS
			
 
				 	};
			
 
				         starpu_data_partition(handle, &f);
			
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -62,7 +62,6 @@ int main(int argc, char **argv)
 
				         starpu_data_handle_t handle;
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				                 .nbuffers = 1,
			
 
				 		.modes = {STARPU_RW}
			
@@ -79,7 +78,7 @@ int main(int argc, char **argv)
 
				         /* Partition the matrix in PARTS sub-matrices */
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = PARTS
			
 
				 	};
			
 
				 	starpu_data_partition(handle, &f);
			
--- a/examples/filters/fvector.c
+++ b/examples/filters/fvector.c
@@ -45,7 +45,6 @@ int main(int argc, char **argv)
 
				 
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				                 .nbuffers = 1,
			
 
				 		.modes = {STARPU_RW}
			
@@ -67,7 +66,7 @@ int main(int argc, char **argv)
 
				         /* Partition the vector in PARTS sub-vectors */
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				 		.nchildren = PARTS
			
 
				 	};
			
 
				 	starpu_data_partition(handle, &f);
			
--- a/examples/filters/shadow.c
+++ b/examples/filters/shadow.c
@@ -99,11 +99,6 @@ int main(int argc, char **argv)
 
				 
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-			|STARPU_CUDA
			
 
				-#endif
			
 
				-			,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				                 .cuda_funcs = {cuda_func, NULL},
			
@@ -136,7 +131,7 @@ int main(int argc, char **argv)
 
				 	 * combined. */
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_shadow_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block_shadow,
			
 
				 		.nchildren = PARTS,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOW /* Shadow width */
			
 
				 	};
			
@@ -145,7 +140,7 @@ int main(int argc, char **argv)
 
				         /* Partition the destination vector in PARTS sub-vectors */
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				 		.nchildren = PARTS,
			
 
				 	};
			
 
				 	starpu_data_partition(handle2, &f2);
			
--- a/examples/filters/shadow2d.c
+++ b/examples/filters/shadow2d.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -153,11 +153,6 @@ int main(int argc, char **argv)
 
				 
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-			|STARPU_CUDA
			
 
				-#endif
			
 
				-			,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				                 .cuda_funcs = {cuda_func, NULL},
			
@@ -217,13 +212,13 @@ int main(int argc, char **argv)
 
				 	 * combined. */
			
 
				 	struct starpu_data_filter fy =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_shadow_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block_shadow,
			
 
				 		.nchildren = PARTSY,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
			
 
				 	};
			
 
				 	struct starpu_data_filter fx =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_shadow_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block_shadow,
			
 
				 		.nchildren = PARTSX,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
			
 
				 	};
			
@@ -232,12 +227,12 @@ int main(int argc, char **argv)
 
				         /* Partition the destination matrix in PARTSY*PARTSX sub-matrices */
			
 
				 	struct starpu_data_filter fy2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = PARTSY,
			
 
				 	};
			
 
				 	struct starpu_data_filter fx2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = PARTSX,
			
 
				 	};
			
 
				 	starpu_data_map_filters(handle2, 2, &fy2, &fx2);
			
--- a/examples/filters/shadow3d.c
+++ b/examples/filters/shadow3d.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -120,11 +120,6 @@ int main(int argc, char **argv)
 
				 
			
 
				         struct starpu_codelet cl =
			
 
				 	{
			
 
				-                .where = STARPU_CPU
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-			|STARPU_CUDA
			
 
				-#endif
			
 
				-			,
			
 
				                 .cpu_funcs = {cpu_func, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				                 .cuda_funcs = {cuda_func, NULL},
			
@@ -235,19 +230,19 @@ int main(int argc, char **argv)
 
				 	 * combined. */
			
 
				 	struct starpu_data_filter fz =
			
 
				 	{
			
 
				-		.filter_func = starpu_depth_block_shadow_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_depth_block_shadow,
			
 
				 		.nchildren = PARTSZ,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
			
 
				 	};
			
 
				 	struct starpu_data_filter fy =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_shadow_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_vertical_block_shadow,
			
 
				 		.nchildren = PARTSY,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
			
 
				 	};
			
 
				 	struct starpu_data_filter fx =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_shadow_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_block_shadow,
			
 
				 		.nchildren = PARTSX,
			
 
				 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
			
 
				 	};
			
@@ -256,17 +251,17 @@ int main(int argc, char **argv)
 
				         /* Partition the destination matrix in PARTSZ*PARTSY*PARTSX sub-matrices */
			
 
				 	struct starpu_data_filter fz2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_depth_block_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_depth_block,
			
 
				 		.nchildren = PARTSZ,
			
 
				 	};
			
 
				 	struct starpu_data_filter fy2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_vertical_block,
			
 
				 		.nchildren = PARTSY,
			
 
				 	};
			
 
				 	struct starpu_data_filter fx2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_block,
			
 
				+		.filter_func = starpu_block_filter_block,
			
 
				 		.nchildren = PARTSX,
			
 
				 	};
			
 
				 	starpu_data_map_filters(handle2, 3, &fz2, &fy2, &fx2);
			
--- a/examples/gl_interop/gl_interop.c
+++ b/examples/gl_interop/gl_interop.c
@@ -39,7 +39,6 @@ void dummy(void *buffers[], void *cl_arg)
 
				 }
			
 
				 
			
 
				 struct starpu_codelet cl = {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = { dummy, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_W },
			
--- a/examples/gl_interop/gl_interop_idle.c
+++ b/examples/gl_interop/gl_interop_idle.c
@@ -42,7 +42,6 @@ void dummy(void *buffers[], void *cl_arg)
 
				 }
			
 
				 
			
 
				 struct starpu_codelet cl = {
			
 
				-	.where = STARPU_CUDA,
			
 
				 	.cuda_funcs = { dummy, NULL },
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = { STARPU_W },
			
--- a/examples/heat/dw_factolu.c
+++ b/examples/heat/dw_factolu.c
@@ -36,7 +36,6 @@ static unsigned no_prio = 0;
 
				 
			
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
			
@@ -48,7 +47,6 @@ static struct starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_codelet cl12 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
			
@@ -60,7 +58,6 @@ static struct starpu_codelet cl12 =
 
				 
			
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
			
@@ -72,7 +69,6 @@ static struct starpu_codelet cl21 =
 
				 
			
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
			
@@ -705,7 +701,7 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	if (pinned)
			
 
				 	{
			
@@ -759,13 +755,13 @@ void dw_factoLU(float *matA, unsigned size,
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/heat/dw_factolu_grain.c
+++ b/examples/heat/dw_factolu_grain.c
@@ -45,7 +45,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
			
@@ -80,7 +79,6 @@ static struct starpu_task *create_task_11(starpu_data_handle_t dataA, unsigned k
 
				 static struct starpu_codelet cl12 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
			
@@ -125,7 +123,6 @@ static void create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
			
@@ -167,7 +164,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, u
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
			
@@ -227,13 +223,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/heat/dw_factolu_tag.c
+++ b/examples/heat/dw_factolu_tag.c
@@ -47,7 +47,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
			
@@ -83,7 +82,6 @@ static struct starpu_task *create_task_11(starpu_data_handle_t dataA, unsigned k
 
				 static struct starpu_codelet cl12 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
			
@@ -128,7 +126,6 @@ static void create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned i)
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
			
@@ -170,7 +167,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
			
@@ -305,13 +301,13 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -431,7 +431,7 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
			
 
				 }
			
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 
				 		if (check)
			
 
				 			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
			
 
				 
			
 
				-		starpu_helper_cublas_shutdown();
			
 
				+		starpu_cublas_shutdown();
			
 
				 		starpu_shutdown();
			
 
				 		free_system(A, B, newsize, pinned);
			
 
				 	}
			
--- a/examples/incrementer/incrementer.c
+++ b/examples/incrementer/incrementer.c
@@ -66,7 +66,6 @@ int main(int argc, char **argv)
 
				 
			
 
				 	struct starpu_codelet cl =
			
 
				 	{
			
 
				-		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 		.cpu_funcs = {cpu_codelet, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_funcs = {cuda_codelet, NULL},
			
--- a/examples/interface/complex_interface.c
+++ b/examples/interface/complex_interface.c
@@ -66,10 +66,10 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsign
 
				 	double *addr_imaginary = 0;
			
 
				 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
			
 
				 
			
 
				-	addr_real = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
			
 
				+	addr_real = (double*) starpu_malloc_on_node(node, requested_memory);
			
 
				 	if (!addr_real)
			
 
				 		goto fail_real;
			
 
				-	addr_imaginary = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
			
 
				+	addr_imaginary = (double*) starpu_malloc_on_node(node, requested_memory);
			
 
				 	if (!addr_imaginary)
			
 
				 		goto fail_imaginary;
			
 
				 
			
@@ -80,7 +80,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsign
 
				 	return 2*requested_memory;
			
 
				 
			
 
				 fail_imaginary:
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) addr_real, requested_memory);
			
 
				+	starpu_free_on_node(node, (uintptr_t) addr_real, requested_memory);
			
 
				 fail_real:
			
 
				 	return -ENOMEM;
			
 
				 }
			
@@ -90,8 +90,8 @@ static void complex_free_data_on_node(void *data_interface, unsigned node)
 
				 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
			
 
				 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
			
 
				 
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
			
 
				-	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
			
 
				+	starpu_free_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
			
 
				+	starpu_free_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
			
 
				 }
			
 
				 
			
 
				 static size_t complex_get_size(starpu_data_handle_t handle)
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -310,7 +310,7 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	init_matrix();
			
 
				 
			
@@ -414,7 +414,7 @@ int main(int argc, char **argv)
 
				 	starpu_free(A);
			
 
				 
			
 
				 	FPRINTF(stderr, "Shutting down\n");
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/examples/lu/xlu.c
+++ b/examples/lu/xlu.c
@@ -256,13 +256,13 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/lu/xlu_implicit.c
+++ b/examples/lu/xlu_implicit.c
@@ -156,13 +156,13 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/lu/xlu_implicit_pivot.c
+++ b/examples/lu/xlu_implicit_pivot.c
@@ -210,13 +210,13 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/lu/xlu_pivot.c
+++ b/examples/lu/xlu_pivot.c
@@ -345,13 +345,13 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_vertical_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_vertical_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
 
				 	struct starpu_data_filter f2 =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func,
			
 
				+		.filter_func = starpu_matrix_filter_block,
			
 
				 		.nchildren = nblocks
			
 
				 	};
			
 
				 
			
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -373,7 +373,6 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet spmd_mandelbrot_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_OPENCL,
			
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {compute_block_spmd, NULL},
			
@@ -385,7 +384,6 @@ static struct starpu_codelet spmd_mandelbrot_cl =
 
				 
			
 
				 static struct starpu_codelet mandelbrot_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_OPENCL,
			
 
				 	.type = STARPU_SEQ,
			
 
				 	.cpu_funcs = {compute_block, NULL},
			
 
				 #ifdef STARPU_USE_OPENCL
			
--- a/examples/matvecmult/matvecmult.c
+++ b/examples/matvecmult/matvecmult.c
@@ -129,7 +129,6 @@ static struct starpu_perfmodel starpu_matvecmult_model =
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				-	.where = STARPU_OPENCL,
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         .opencl_funcs[0] = opencl_codelet,
			
 
				 #endif
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -119,12 +119,12 @@ static void partition_mult_data(void)
 
				 
			
 
				 	struct starpu_data_filter vert;
			
 
				 	memset(&vert, 0, sizeof(vert));
			
 
				-	vert.filter_func = starpu_vertical_block_filter_func;
			
 
				+	vert.filter_func = starpu_matrix_filter_vertical_block;
			
 
				 	vert.nchildren = nslicesx;
			
 
				 
			
 
				 	struct starpu_data_filter horiz;
			
 
				 	memset(&horiz, 0, sizeof(horiz));
			
 
				-	horiz.filter_func = starpu_block_filter_func;
			
 
				+	horiz.filter_func = starpu_matrix_filter_block;
			
 
				 	horiz.nchildren = nslicesy;
			
 
				 
			
 
				 	starpu_data_partition(B_handle, &vert);
			
@@ -202,7 +202,6 @@ static struct starpu_perfmodel starpu_gemm_model =
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {cpu_mult, NULL},
			
@@ -297,7 +296,7 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	init_problem_data();
			
 
				 	partition_mult_data();
			
@@ -357,7 +356,7 @@ enodev:
 
				 	starpu_free(B);
			
 
				 	starpu_free(C);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return ret;
			
--- a/examples/openmp/vector_scal.c
+++ b/examples/openmp/vector_scal.c
@@ -57,7 +57,6 @@ static struct starpu_perfmodel vector_scal_model =
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU,
			
 
				 	.type = STARPU_FORKJOIN,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {scal_cpu_func, NULL},
			
--- a/examples/pi/pi.c
+++ b/examples/pi/pi.c
@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	struct starpu_data_filter f =
			
 
				 	{
			
 
				-		.filter_func = starpu_block_filter_func_vector,
			
 
				+		.filter_func = starpu_vector_filter_block,
			
 
				 		.nchildren = ntasks
			
 
				 	};
			
 
				 	
			
@@ -129,7 +129,6 @@ int main(int argc, char **argv)
 
				 
			
 
				 	struct starpu_codelet cl =
			
 
				 	{
			
 
				-		.where = STARPU_CPU|STARPU_CUDA,
			
 
				 		.cpu_funcs = {cpu_kernel, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 		.cuda_funcs = {cuda_kernel, NULL},
			
--- a/examples/pi/pi_redux.c
+++ b/examples/pi/pi_redux.c
@@ -189,11 +189,6 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 
				 
			
 
				 static struct starpu_codelet pi_cl =
			
 
				 {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {pi_func_cpu, NULL},
			
 
				 #ifdef STARPU_HAVE_CURAND
			
 
				 	.cuda_funcs = {pi_func_cuda, NULL},
			
@@ -205,11 +200,6 @@ static struct starpu_codelet pi_cl =
 
				 
			
 
				 static struct starpu_codelet pi_cl_redux =
			
 
				 {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {pi_func_cpu, NULL},
			
 
				 #ifdef STARPU_HAVE_CURAND
			
 
				 	.cuda_funcs = {pi_func_cuda, NULL},
			
@@ -240,11 +230,6 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet init_codelet =
			
 
				 {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				         .cpu_funcs = {init_cpu_func, NULL},
			
 
				 #ifdef STARPU_HAVE_CURAND
			
 
				         .cuda_funcs = {init_cuda_func, NULL},
			
@@ -282,11 +267,6 @@ static void redux_cpu_func(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet redux_codelet =
			
 
				 {
			
 
				-	.where =
			
 
				-#ifdef STARPU_HAVE_CURAND
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {redux_cpu_func, NULL},
			
 
				 #ifdef STARPU_HAVE_CURAND
			
 
				 	.cuda_funcs = {redux_cuda_func, NULL},
			
--- a/examples/pipeline/pipeline.c
+++ b/examples/pipeline/pipeline.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  * Copyright (C) 2012  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -77,7 +77,6 @@ static struct starpu_perfmodel pipeline_model_x =
 
				 
			
 
				 static struct starpu_codelet pipeline_codelet_x =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {pipeline_cpu_x, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_W},
			
@@ -113,11 +112,6 @@ static struct starpu_perfmodel pipeline_model_axpy =
 
				 
			
 
				 static struct starpu_codelet pipeline_codelet_axpy =
			
 
				 {
			
 
				-	.where = STARPU_CPU
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		| STARPU_CUDA
			
 
				-#endif
			
 
				-		,
			
 
				 	.cpu_funcs = {pipeline_cpu_axpy, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {pipeline_cublas_axpy, NULL},
			
@@ -160,11 +154,6 @@ static struct starpu_perfmodel pipeline_model_sum =
 
				 
			
 
				 static struct starpu_codelet pipeline_codelet_sum =
			
 
				 {
			
 
				-	.where = STARPU_CPU
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		| STARPU_CUDA
			
 
				-#endif
			
 
				-		,
			
 
				 	.cpu_funcs = {pipeline_cpu_sum, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {pipeline_cublas_sum, NULL},
			
@@ -186,7 +175,7 @@ int main(void)
 
				 		exit(77);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	/* Initialize the K temporary buffers. No need to allocate it ourselves
			
 
				 	 * Since it's the X and Y kernels which will fill the initial values. */
			
--- a/examples/ppm_downscaler/yuv_downscaler.c
+++ b/examples/ppm_downscaler/yuv_downscaler.c
@@ -86,7 +86,6 @@ static void ds_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
 
				 
			
 
				 static struct starpu_codelet ds_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {ds_kernel_cpu, NULL},
			
 
				 	.nbuffers = 2, /* input -> output */
			
 
				 	.modes = {STARPU_R, STARPU_W},
			
@@ -96,13 +95,13 @@ static struct starpu_codelet ds_codelet =
 
				 /* each block contains BLOCK_HEIGHT consecutive lines */
			
 
				 static struct starpu_data_filter filter_y =
			
 
				 {
			
 
				-	.filter_func = starpu_block_filter_func,
			
 
				+	.filter_func = starpu_matrix_filter_block,
			
 
				 	.nchildren= HEIGHT/BLOCK_HEIGHT
			
 
				 };
			
 
				 
			
 
				 static struct starpu_data_filter filter_uv =
			
 
				 {
			
 
				-	.filter_func = starpu_block_filter_func,
			
 
				+	.filter_func = starpu_matrix_filter_block,
			
 
				 	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
			
 
				 };
			
 
				 
			
--- a/examples/profiling/profiling.c
+++ b/examples/profiling/profiling.c
@@ -50,7 +50,6 @@ int main(int argc, char **argv)
 
				 
			
 
				 	struct starpu_codelet cl =
			
 
				 	{
			
 
				-		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 		.cpu_funcs = {sleep_codelet, NULL},
			
 
				 		.cuda_funcs = {sleep_codelet, NULL},
			
 
				 		.opencl_funcs = {sleep_codelet, NULL},
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -333,7 +333,7 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
			
 
				 #endif
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	unsigned long nelems = nblocks*entries_per_block;
			
 
				 	size_t size = nelems*sizeof(float);
			
@@ -400,7 +400,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				         ret = starpu_opencl_unload_opencl(&opencl_program);
			
--- a/examples/reductions/minmax_reduction.c
+++ b/examples/reductions/minmax_reduction.c
@@ -57,7 +57,6 @@ static void minmax_neutral_cpu_func(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet minmax_init_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {minmax_neutral_cpu_func, NULL},
			
 
				 	.nbuffers = 1
			
 
				 };
			
@@ -84,7 +83,6 @@ void minmax_redux_cpu_func(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet minmax_redux_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {minmax_redux_cpu_func, NULL},
			
 
				 	.nbuffers = 2
			
 
				 };
			
@@ -119,7 +117,6 @@ void minmax_cpu_func(void *descr[], void *cl_arg)
 
				 
			
 
				 static struct starpu_codelet minmax_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {minmax_cpu_func, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_REDUX}
			
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -32,7 +32,6 @@ static void sched_ctx_func(void *descr[] __attribute__ ((unused)), void *arg __a
 
				 
			
 
				 static struct starpu_codelet sched_ctx_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_funcs = {sched_ctx_func, NULL},
			
 
				 	.cuda_funcs = {sched_ctx_func, NULL},
			
 
				 	.opencl_funcs = {sched_ctx_func, NULL},
			
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -133,7 +133,6 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
				 
			
 
				 static struct starpu_codelet dummy_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_funcs = {dummy_func, NULL},
			
 
				 	.cuda_funcs = {dummy_func, NULL},
			
 
				         .opencl_funcs = {dummy_func, NULL},
			
--- a/examples/spmd/vector_scal_spmd.c
+++ b/examples/spmd/vector_scal_spmd.c
@@ -81,7 +81,6 @@ static struct starpu_perfmodel vector_scal_model =
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				 	.modes = { STARPU_RW },
			
 
				-	.where = STARPU_CPU,
			
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {scal_cpu_func, NULL},
			
--- a/examples/spmv/dw_block_spmv.c
+++ b/examples/spmv/dw_block_spmv.c
@@ -121,17 +121,17 @@ void call_filters(void)
 
				 	struct starpu_data_filter bcsr_f;
			
 
				 	struct starpu_data_filter vector_in_f, vector_out_f;
			
 
				 
			
 
				-	bcsr_f.filter_func    = starpu_canonical_block_filter_bcsr;
			
 
				+	bcsr_f.filter_func    = starpu_bcsr_filter_canonical_block;
			
 
				 	bcsr_f.get_nchildren = get_bcsr_nchildren;
			
 
				 	/* the children use a matrix interface ! */
			
 
				 	bcsr_f.get_child_ops = get_bcsr_child_ops;
			
 
				 
			
 
				-	vector_in_f.filter_func = starpu_block_filter_func_vector;
			
 
				+	vector_in_f.filter_func = starpu_vector_filter_block;
			
 
				 	vector_in_f.nchildren  = size/c;
			
 
				 	vector_in_f.get_nchildren  = NULL;
			
 
				 	vector_in_f.get_child_ops  = NULL;
			
 
				 	
			
 
				-	vector_out_f.filter_func = starpu_block_filter_func_vector;
			
 
				+	vector_out_f.filter_func = starpu_vector_filter_block;
			
 
				 	vector_out_f.nchildren  = size/r;
			
 
				 	vector_out_f.get_nchildren  = NULL;
			
 
				 	vector_out_f.get_child_ops  = NULL;
			
@@ -147,7 +147,6 @@ unsigned totaltasks;
 
				 
			
 
				 struct starpu_codelet cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = { cpu_block_spmv, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {cublas_block_spmv, NULL},
			
--- a/examples/spmv/spmv.c
+++ b/examples/spmv/spmv.c
@@ -88,14 +88,13 @@ static struct starpu_data_filter csr_f =
 
				 
			
 
				 static struct starpu_data_filter vector_f =
			
 
				 {
			
 
				-	.filter_func = starpu_block_filter_func_vector,
			
 
				+	.filter_func = starpu_vector_filter_block,
			
 
				 	/* This value is defined later on */
			
 
				 	.nchildren = -1,
			
 
				 };
			
 
				 
			
 
				 static struct starpu_codelet spmv_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_funcs = {spmv_kernel_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {spmv_kernel_cuda, NULL},
			
--- a/examples/stencil/stencil-kernels.c
+++ b/examples/stencil/stencil-kernels.c
@@ -456,14 +456,6 @@ static struct starpu_perfmodel cl_update_model =
 
				 
			
 
				 struct starpu_codelet cl_update =
			
 
				 {
			
 
				-	.where = 0 |
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-                STARPU_OPENCL|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {update_func_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {update_func_cuda, NULL},
			
@@ -664,14 +656,6 @@ static struct starpu_perfmodel save_cl_top_model =
 
				 
			
 
				 struct starpu_codelet save_cl_bottom =
			
 
				 {
			
 
				-	.where = 0 |
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-		STARPU_OPENCL|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
			
@@ -686,14 +670,6 @@ struct starpu_codelet save_cl_bottom =
 
				 
			
 
				 struct starpu_codelet save_cl_top =
			
 
				 {
			
 
				-	.where = 0|
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-		STARPU_CUDA|
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-		STARPU_OPENCL|
			
 
				-#endif
			
 
				-		STARPU_CPU,
			
 
				 	.cpu_funcs = {dummy_func_top_cpu, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {dummy_func_top_cuda, NULL},
			
--- a/examples/stencil/stencil-tasks.c
+++ b/examples/stencil/stencil-tasks.c
@@ -217,7 +217,6 @@ static void null_func(void *descr[] __attribute__((unused)), void *arg __attribu
 
				 static struct starpu_codelet null =
			
 
				 {
			
 
				 	.modes = { STARPU_W, STARPU_W },
			
 
				-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
			
 
				 	.cpu_funcs = {null_func, NULL},
			
 
				 	.cuda_funcs = {null_func, NULL},
			
 
				 	.opencl_funcs = {null_func, NULL},
			
--- a/examples/top/hello_world_top.c
+++ b/examples/top/hello_world_top.c
@@ -100,7 +100,6 @@ struct starpu_codelet cl =
 
				 {
			
 
				 	/* this codelet may only be executed on a CPU, and its cpu
			
 
				  	 * implementation is function "cpu_func" */
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {cpu_func, NULL},
			
 
				 	/* the codelet does not manipulate any data that is managed
			
 
				 	 * by our DSM */
			
--- a/gcc-plugin/examples/cholesky/cholesky.c
+++ b/gcc-plugin/examples/cholesky/cholesky.c
@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 
				 //	conf.calibrate = 1;
			
 
				 #pragma starpu initialize
			
 
				 
			
 
				-        starpu_helper_cublas_init();
			
 
				+        starpu_cublas_init();
			
 
				 
			
 
				 	float bmat[nblocks][nblocks][BLOCKSIZE * BLOCKSIZE] __heap;
			
 
				 
			
@@ -247,7 +247,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				         }
			
 
				 
			
 
				-        starpu_helper_cublas_shutdown();
			
 
				+        starpu_cublas_shutdown();
			
 
				 #pragma starpu shutdown
			
 
				 
			
 
				 	assert(correctness);
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -43,6 +43,7 @@ typedef unsigned long long uint64_t;
 
				 #include <starpu_data.h>
			
 
				 #include <starpu_data_interfaces.h>
			
 
				 #include <starpu_data_filters.h>
			
 
				+#include <starpu_stdlib.h>
			
 
				 #include <starpu_perfmodel.h>
			
 
				 #include <starpu_worker.h>
			
 
				 #include <starpu_task.h>
			
@@ -62,6 +63,7 @@ typedef unsigned long long uint64_t;
 
				 #include <starpu_profiling.h>
			
 
				 #include <starpu_top.h>
			
 
				 #include <starpu_fxt.h>
			
 
				+#include <starpu_driver.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 extern "C"
			
@@ -72,31 +74,6 @@ extern "C"
 
				 #define main starpu_main
			
 
				 #endif
			
 
				 
			
 
				-struct starpu_driver
			
 
				-{
			
 
				-	enum starpu_archtype type;
			
 
				-	union
			
 
				-	{
			
 
				-		unsigned cpu_id;
			
 
				-		unsigned cuda_id;
			
 
				-#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				-		cl_device_id opencl_id;
			
 
				-#elif defined(STARPU_SIMGRID)
			
 
				-		unsigned opencl_id;
			
 
				-#endif
			
 
				-		/*
			
 
				-		 * HOWTO: add a new kind of device to the starpu_driver structure.
			
 
				-		 * 1) Add a member to this union.
			
 
				-		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
			
 
				-		 *    not always launched.
			
 
				-		 * 3) Edit starpu_driver_run() so that it can handle another
			
 
				-		 *    kind of architecture.
			
 
				-		 * 4) Write _starpu_run_foobar() in the corresponding driver.
			
 
				-		 * 5) Test the whole thing :)
			
 
				-		 */
			
 
				-	} id;
			
 
				-};
			
 
				-
			
 
				 struct starpu_conf
			
 
				 {
			
 
				 	/* Will be initialized by starpu_conf_init */
			
@@ -173,18 +150,11 @@ int starpu_asynchronous_opencl_copy_disabled(void);
 
				 
			
 
				 void starpu_profiling_init();
			
 
				 void starpu_display_stats();
			
 
				-int starpu_driver_run(struct starpu_driver *d);
			
 
				-void starpu_drivers_request_termination(void);
			
 
				 
			
 
				-int starpu_driver_init(struct starpu_driver *d);
			
 
				-int starpu_driver_run_once(struct starpu_driver *d);
			
 
				-int starpu_driver_deinit(struct starpu_driver *d);
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-#if defined(STARPU_USE_DEPRECATED_API)
			
 
				 #include "starpu_deprecated_api.h"
			
 
				-#endif /* STARPU_USE_DEPRECATED_API */
			
 
				 
			
 
				 #endif /* __STARPU_H__ */
			
--- a/include/starpu_cublas.h
+++ b/include/starpu_cublas.h
@@ -23,8 +23,8 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 /* Some helper functions for application using CUBLAS kernels */
			
 
				-void starpu_helper_cublas_init(void);
			
 
				-void starpu_helper_cublas_shutdown(void);
			
 
				+void starpu_cublas_init(void);
			
 
				+void starpu_cublas_shutdown(void);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -85,9 +85,6 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 
				 void starpu_data_release(starpu_data_handle_t handle);
			
 
				 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
			
 
				 
			
 
				-void starpu_malloc_set_align(size_t align);
			
 
				-int starpu_malloc(void **A, size_t dim);
			
 
				-int starpu_free(void *A);
			
 
				 void starpu_memory_display_stats();
			
 
				 
			
 
				 /* XXX These macros are provided to avoid breaking old codes. But consider
			
--- a/include/starpu_data_filters.h
+++ b/include/starpu_data_filters.h
@@ -58,28 +58,28 @@ void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters,
 
				 /* a few examples of filters */
			
 
				 
			
 
				 /* for BCSR */
			
 
				-void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				 
			
 
				 /* (filters for matrix interface) */
			
 
				-void starpu_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vertical_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				 
			
 
				 /* for vector */
			
 
				-void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_block_shadow_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_vector_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				 
			
 
				 /* for block */
			
 
				-void starpu_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vertical_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_vertical_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_depth_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				-void starpu_depth_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				+void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -135,11 +135,6 @@ int starpu_data_interface_get_next_id(void);
 
				 void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node, void *data_interface, struct starpu_data_interface_ops *ops);
			
 
				 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
			
 
				 
			
 
				-/* Allocate SIZE bytes on node NODE */
			
 
				-uintptr_t starpu_allocate_buffer_on_node(unsigned dst_node, size_t size);
			
 
				-/* Free ADDR on node NODE */
			
 
				-void starpu_free_buffer_on_node(unsigned dst_node, uintptr_t addr, size_t size);
			
 
				-
			
 
				 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
			
 
				  * interface does not support this operation or data for this handle is not
			
 
				  * allocated on that node. */
			
--- a/include/starpu_deprecated_api.h
+++ b/include/starpu_deprecated_api.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -23,8 +23,11 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 
			
 
				+#if defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
			
 
				 #warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
			
 
				+#endif /* defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API) */
			
 
				 
			
 
				+#ifdef STARPU_USE_DEPRECATED_API
			
 
				 typedef starpu_data_handle_t starpu_data_handle;
			
 
				 typedef struct starpu_block_interface starpu_block_interface_t;
			
 
				 typedef struct starpu_matrix_interface starpu_matrix_interface_t;
			
@@ -56,6 +59,37 @@ typedef enum starpu_access_mode starpu_access_mode;
 
				 #define starpu_pack_cl_args   	       starpu_codelet_pack_args
			
 
				 #define starpu_task_deinit	       starpu_task_clean
			
 
				 
			
 
				+#endif /* STARPU_USE_DEPRECATED_API */
			
 
				+
			
 
				+#ifdef STARPU_USE_DEPRECATED_ONE_ZERO_API
			
 
				+/* Each macro below maps a former function name to its current name. First group: on-node allocation and CUBLAS helpers. */
			
 
				+#define starpu_allocate_buffer_on_node	starpu_malloc_on_node
			
 
				+#define starpu_free_buffer_on_node	starpu_free_on_node
			
 
				+#define starpu_helper_cublas_init	starpu_cublas_init
			
 
				+#define starpu_helper_cublas_shutdown	starpu_cublas_shutdown
			
 
				+/* BCSR and CSR filter functions */
			
 
				+#define starpu_canonical_block_filter_bcsr	starpu_bcsr_filter_canonical_block
			
 
				+#define starpu_vertical_block_filter_func_csr	starpu_csr_filter_vertical_block
			
 
				+/* Matrix filter functions */
			
 
				+#define starpu_block_filter_func			starpu_matrix_filter_block
			
 
				+#define starpu_block_shadow_filter_func			starpu_matrix_filter_block_shadow
			
 
				+#define starpu_vertical_block_filter_func		starpu_matrix_filter_vertical_block
			
 
				+#define starpu_vertical_block_shadow_filter_func	starpu_matrix_filter_vertical_block_shadow
			
 
				+/* Vector filter functions */
			
 
				+#define starpu_block_filter_func_vector		starpu_vector_filter_block
			
 
				+#define starpu_block_shadow_filter_func_vector	starpu_vector_filter_block_shadow
			
 
				+#define starpu_vector_list_filter_func		starpu_vector_filter_list
			
 
				+#define starpu_vector_divide_in_2_filter_func	starpu_vector_filter_divide_in_2
			
 
				+/* Block (3D) filter functions -- NOTE(review): "3D" is inferred from the depth_block variants; confirm */
			
 
				+#define starpu_block_filter_func_block			starpu_block_filter_block
			
 
				+#define starpu_block_shadow_filter_func_block		starpu_block_filter_block_shadow
			
 
				+#define starpu_vertical_block_filter_func_block		starpu_block_filter_vertical_block
			
 
				+#define starpu_vertical_block_shadow_filter_func_block	starpu_block_filter_vertical_block_shadow
			
 
				+#define starpu_depth_block_filter_func_block		starpu_block_filter_depth_block
			
 
				+#define starpu_depth_block_shadow_filter_func_block	starpu_block_filter_depth_block_shadow
			
 
				+
			
 
				+#endif /* STARPU_USE_DEPRECATED_ONE_ZERO_API */
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/include/starpu_driver.h
+++ b/include/starpu_driver.h
@@ -0,0 +1,67 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2009-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_DRIVER_H__
			
 
				+#define __STARPU_DRIVER_H__
			
 
				+
			
 
				+#include <starpu_config.h>
			
 
				+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				+#include <starpu_opencl.h>
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C"
			
 
				+{
			
 
				+#endif
			
 
				+
			
 
				+struct starpu_driver /* Describes one driver: an architecture type plus an architecture-specific id. */
			
 
				+{
			
 
				+	enum starpu_archtype type; /* architecture kind; presumably selects which member of `id` is meaningful -- confirm against starpu_driver_run() */
			
 
				+	union
			
 
				+	{
			
 
				+		unsigned cpu_id; /* id used for a CPU driver */
			
 
				+		unsigned cuda_id; /* id used for a CUDA driver */
			
 
				+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
			
 
				+		cl_device_id opencl_id; /* only present when STARPU_USE_OPENCL is defined and __CUDACC__ is not */
			
 
				+#elif defined(STARPU_SIMGRID)
			
 
				+		unsigned opencl_id; /* plain integer variant, used when the branch above is not taken and STARPU_SIMGRID is defined */
			
 
				+#endif /* NOTE(review): if neither branch is taken the union has no opencl_id, and cl_device_id vs unsigned may give the union a different size in translation units built with and without __CUDACC__ -- confirm this cannot cause a layout mismatch */
			
 
				+		/*
			
 
				+		 * HOWTO: add a new kind of device to the starpu_driver structure.
			
 
				+		 * 1) Add a member to this union.
			
 
				+		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
			
 
				+		 *    not always launched.
			
 
				+		 * 3) Edit starpu_driver_run() so that it can handle another
			
 
				+		 *    kind of architecture.
			
 
				+		 * 4) Write _starpu_run_foobar() in the corresponding driver.
			
 
				+		 * 5) Test the whole thing :)
			
 
				+		 */
			
 
				+	} id; /* per-architecture identifier; members overlap in storage */
			
 
				+};
			
 
				+
			
 
				+int starpu_driver_run(struct starpu_driver *d);
			
 
				+void starpu_drivers_request_termination(void);
			
 
				+
			
 
				+int starpu_driver_init(struct starpu_driver *d);
			
 
				+int starpu_driver_run_once(struct starpu_driver *d);
			
 
				+int starpu_driver_deinit(struct starpu_driver *d);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_DRIVER_H__ */
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -183,6 +183,8 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
				 
			
 
				 /* Return the current date in us */
			
 
				 double starpu_timing_now(void);
			
 
				+/* Returns the perfmodel footprint for the task */
			
 
				+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Returns expected task duration in us */
			
 
				 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
			
 
				 /* Returns an estimated speedup factor relative to CPU speed */
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -0,0 +1,41 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2013  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_STDLIB_H__
			
 
				+#define __STARPU_STDLIB_H__
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C"
			
 
				+{
			
 
				+#endif
			
 
				+
			
 
				+void starpu_malloc_set_align(size_t align);
			
 
				+int starpu_malloc(void **A, size_t dim);
			
 
				+int starpu_free(void *A);
			
 
				+
			
 
				+/* Allocate size bytes on node dst_node */
			
 
				+uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
			
 
				+/* Free addr on node dst_node */
			
 
				+void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_STDLIB_H__ */
			
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -50,7 +50,7 @@ extern "C"
 
				 #  define STARPU_ATTRIBUTE_INTERNAL
			
 
				 #endif
			
 
				 
			
 
				-#if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API)
			
 
				+#if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API) && !defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
			
 
				 #define STARPU_DEPRECATED  __attribute__((__deprecated__))
			
 
				 #else
			
 
				 #define STARPU_DEPRECATED
			
--- a/mpi/examples/complex/mpi_complex.c
+++ b/mpi/examples/complex/mpi_complex.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -37,6 +37,10 @@ int main(int argc, char **argv)
 
				 	int ret;
			
 
				 	int compare;
			
 
				 
			
 
				+	starpu_data_handle_t handle;
			
 
				+	starpu_data_handle_t handle2;
			
 
				+	starpu_data_handle_t foo_handle;
			
 
				+
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 	ret = starpu_mpi_init(&argc, &argv, 1);
			
@@ -55,11 +59,9 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			double real[2] = {4.0, 2.0};
			
 
				 			double imaginary[2] = {7.0, 9.0};
			
 
				-			starpu_data_handle_t handle;
			
 
				 
			
 
				 			double real2[2] = {14.0, 12.0};
			
 
				 			double imaginary2[2] = {17.0, 19.0};
			
 
				-			starpu_data_handle_t handle2;
			
 
				 
			
 
				 			int *compare_ptr = &compare;
			
 
				 
			
@@ -76,7 +78,6 @@ int main(int argc, char **argv)
 
				 			{
			
 
				 				// We send a dummy variable only to check communication with predefined datatypes
			
 
				 				int foo=12;
			
 
				-				starpu_data_handle_t foo_handle;
			
 
				 				starpu_variable_data_register(&foo_handle, 0, (uintptr_t)&foo, sizeof(foo));
			
 
				 				starpu_mpi_isend_detached(foo_handle, 1, 40, MPI_COMM_WORLD, NULL, NULL);
			
 
				 				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
			
@@ -86,7 +87,6 @@ int main(int argc, char **argv)
 
				 		{
			
 
				 			double real[2] = {0.0, 0.0};
			
 
				 			double imaginary[2] = {0.0, 0.0};
			
 
				-			starpu_data_handle_t handle;
			
 
				 
			
 
				 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
			
 
				 			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
			
@@ -96,7 +96,6 @@ int main(int argc, char **argv)
 
				 			{
			
 
				 				// We send a dummy variable only to check communication with predefined datatypes
			
 
				 				int foo=12;
			
 
				-				starpu_data_handle_t foo_handle;
			
 
				 				starpu_variable_data_register(&foo_handle, -1, (uintptr_t)NULL, sizeof(foo));
			
 
				 				starpu_mpi_irecv_detached(foo_handle, 0, 40, MPI_COMM_WORLD, NULL, NULL);
			
 
				 				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
			
@@ -104,7 +103,19 @@ int main(int argc, char **argv)
 
				 
			
 
				 		}
			
 
				 	}
			
 
				+
			
 
				 	starpu_task_wait_for_all();
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		starpu_data_unregister(handle2);
			
 
				+	}
			
 
				+	if (rank == 0 || rank == 1)
			
 
				+	{
			
 
				+		starpu_data_unregister(handle);
			
 
				+		starpu_data_unregister(foo_handle);
			
 
				+	}
			
 
				+
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky.c
@@ -41,23 +41,23 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	parse_args(argc, argv, nodes);
			
 
				 
			
 
				 	matrix_init(&bmat, rank, nodes, 1);
			
 
				 	matrix_display(bmat, rank);
			
 
				 
			
 
				-	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
			
 
				+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 
			
 
				 	matrix_display(bmat, rank);
			
 
				 
			
 
				-	dw_cholesky_check_computation(bmat, size, rank, nodes, &correctness, &flops);
			
 
				+	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
			
 
				 
			
 
				 	matrix_free(&bmat, rank, nodes, 1);
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	assert(correctness);
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
@@ -29,7 +29,6 @@
 
				 
			
 
				 static struct starpu_codelet cl11 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
			
@@ -41,7 +40,6 @@ static struct starpu_codelet cl11 =
 
				 
			
 
				 static struct starpu_codelet cl21 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
			
@@ -53,7 +51,6 @@ static struct starpu_codelet cl21 =
 
				 
			
 
				 static struct starpu_codelet cl22 =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
			
@@ -67,7 +64,7 @@ static struct starpu_codelet cl22 =
 
				  *	code to bootstrap the factorization
			
 
				  *	and construct the DAG
			
 
				  */
			
 
				-void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops)
			
 
				+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
			
 
				 {
			
 
				 	struct timeval start;
			
 
				 	struct timeval end;
			
@@ -169,7 +166,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 
				 	}
			
 
				 }
			
 
				 
			
 
				-void dw_cholesky_check_computation(float ***matA, unsigned size, int rank, int nodes, int *correctness, double *flops)
			
 
				+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops)
			
 
				 {
			
 
				 	unsigned i,j,x,y;
			
 
				 	float *rmat = malloc(size*size*sizeof(float));
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h
@@ -23,8 +23,8 @@
 
				  *	code to bootstrap the factorization
			
 
				  *	and construct the DAG
			
 
				  */
			
 
				-void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops);
			
 
				+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops);
			
 
				 
			
 
				-void dw_cholesky_check_computation(float ***matA, unsigned size, int rank, int nodes, int *correctness, double *flops);
			
 
				+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops);
			
 
				 
			
 
				 #endif /* __MPI_CHOLESKY_CODELETS_H__ */
			
--- a/mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
+++ b/mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
@@ -40,18 +40,18 @@ int main(int argc, char **argv)
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	parse_args(argc, argv, nodes);
			
 
				 
			
 
				 	matrix_init(&bmat, rank, nodes, 0);
			
 
				 
			
 
				-	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
			
 
				+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 
			
 
				 	matrix_free(&bmat, rank, nodes, 0);
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	if (rank == 0)
			
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -117,16 +117,16 @@ unsigned STARPU_PLU(display_flag)(void)
 
				 	return display;
			
 
				 }
			
 
				 
			
 
				-static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
			
 
				+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
			
 
				 {
			
 
				-	const unsigned block_size = (size/nblocks);
			
 
				+	const unsigned block_size = (psize/pnblocks);
			
 
				 
			
 
				 	unsigned i, j;
			
 
				 	for (i = 0; i < block_size; i++)
			
 
				-	for (j = 0; j < block_size; j++)
			
 
				-	{
			
 
				-		blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				-	}
			
 
				+	     for (j = 0; j < block_size; j++)
			
 
				+	     {
			
 
				+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				+	     }
			
 
				 }
			
 
				 
			
 
				 #ifdef SINGLE_TMP11
			
@@ -163,15 +163,15 @@ starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static unsigned tmp_11_block_is_needed(int rank, unsigned nblocks, unsigned k)
			
 
				+static unsigned tmp_11_block_is_needed(int rank, unsigned pnblocks, unsigned k)
			
 
				 {
			
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
			
 
				+static unsigned tmp_12_block_is_needed(int rank, unsigned pnblocks, unsigned j)
			
 
				 {
			
 
				 	unsigned i;
			
 
				-	for (i = 1; i < nblocks; i++)
			
 
				+	for (i = 1; i < pnblocks; i++)
			
 
				 	{
			
 
				 		if (get_block_rank(i, j) == rank)
			
 
				 			return 1;
			
@@ -180,10 +180,10 @@ static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static unsigned tmp_21_block_is_needed(int rank, unsigned nblocks, unsigned i)
			
 
				+static unsigned tmp_21_block_is_needed(int rank, unsigned pnblocks, unsigned i)
			
 
				 {
			
 
				 	unsigned j;
			
 
				-	for (j = 1; j < nblocks; j++)
			
 
				+	for (j = 1; j < pnblocks; j++)
			
 
				 	{
			
 
				 		if (get_block_rank(i, j) == rank)
			
 
				 			return 1;
			
@@ -373,7 +373,7 @@ starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
 
				 	return dataA_handles[j+i*nblocks];
			
 
				 }
			
 
				 
			
 
				-static void display_grid(int rank, unsigned nblocks)
			
 
				+static void display_grid(int rank, unsigned pnblocks)
			
 
				 {
			
 
				 	if (!display)
			
 
				 		return;
			
@@ -383,9 +383,9 @@ static void display_grid(int rank, unsigned nblocks)
 
				 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
			
 
				 
			
 
				 		unsigned i, j;
			
 
				-		for (j = 0; j < nblocks; j++)
			
 
				+		for (j = 0; j < pnblocks; j++)
			
 
				 		{
			
 
				-			for (i = 0; i < nblocks; i++)
			
 
				+			for (i = 0; i < pnblocks; i++)
			
 
				 			{
			
 
				 				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
			
 
				 				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
			
@@ -432,7 +432,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	STARPU_ASSERT(p*q == world_size);
			
 
				 
			
 
				-	starpu_helper_cublas_init();
			
 
				+	starpu_cublas_init();
			
 
				 
			
 
				 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
@@ -563,7 +563,7 @@ int main(int argc, char **argv)
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
 
				 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
			
 
				 
			
 
				-	starpu_helper_cublas_shutdown();
			
 
				+	starpu_cublas_shutdown();
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/examples/stencil/stencil5.c
+++ b/mpi/examples/stencil/stencil5.c
@@ -31,7 +31,6 @@ void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet stencil5_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {stencil5_cpu, NULL},
			
 
				 	.nbuffers = 5,
			
 
				 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -175,6 +175,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 
				 		starpu_data_handle_t size_handle;
			
 
				 		starpu_variable_data_register(&size_handle, 0, (uintptr_t)&(size), sizeof(size));
			
 
				 		starpu_mpi_send(size_handle, dest, mpi_tag, comm);
			
 
				+		starpu_data_unregister(size_handle);
			
 
				 	}
			
 
				 
			
 
				 	return _starpu_mpi_isend_irecv_common(data_handle, size, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_pack_func, STARPU_R);
			
@@ -590,7 +591,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
				 
			
 
				 	ret = barrier_req->ret;
			
 
				 
			
 
				-	//free(waiting_req);
			
 
				+	free(barrier_req);
			
 
				 	_STARPU_MPI_LOG_OUT();
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/src/starpu_mpi_insert_task.c
+++ b/mpi/src/starpu_mpi_insert_task.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- * Copyright (C) 2011-2012  Université de Bordeaux 1
			
 
				+ * Copyright (C) 2011-2013  Université de Bordeaux 1
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -459,6 +459,19 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 		{
			
 
				 			va_arg(varg_list, int);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_HYPERVISOR_TAG)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, int);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_FLOPS)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, double);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_TAG)
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
			
 
				+		}
			
 
				+
			
 
				 	}
			
 
				 	va_end(varg_list);
			
 
				 
			
@@ -559,6 +572,18 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 		{
			
 
				 			va_arg(varg_list, starpu_data_handle_t);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_HYPERVISOR_TAG)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, int);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_FLOPS)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, double);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_TAG)
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
			
 
				+		}
			
 
				 	}
			
 
				 	va_end(varg_list);
			
 
				 
			
@@ -628,7 +653,19 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 			{
			
 
				 				va_arg(varg_list, starpu_data_handle_t);
			
 
				 			}
			
 
				-		}
			
 
				+			else if (arg_type==STARPU_HYPERVISOR_TAG)
			
 
				+			{
			
 
				+				(void)va_arg(varg_list, int);
			
 
				+			}
			
 
				+			else if (arg_type==STARPU_FLOPS)
			
 
				+			{
			
 
				+				(void)va_arg(varg_list, double);
			
 
				+			}
			
 
				+			else if (arg_type==STARPU_TAG)
			
 
				+			{
			
 
				+				STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
			
 
				+			}
			
 
				+			}
			
 
				 		va_end(varg_list);
			
 
				 	}
			
 
				 
			
@@ -686,6 +723,18 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 
				 		{
			
 
				 			va_arg(varg_list, starpu_data_handle_t);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_HYPERVISOR_TAG)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, int);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_FLOPS)
			
 
				+		{
			
 
				+			(void)va_arg(varg_list, double);
			
 
				+		}
			
 
				+		else if (arg_type==STARPU_TAG)
			
 
				+		{
			
 
				+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	va_end(varg_list);
			
--- a/mpi/tests/insert_task.c
+++ b/mpi/tests/insert_task.c
@@ -29,7 +29,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet mycodelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_RW, STARPU_R}
			
--- a/mpi/tests/insert_task_block.c
+++ b/mpi/tests/insert_task_block.c
@@ -46,7 +46,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet mycodelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_RW}
			
--- a/mpi/tests/insert_task_cache.c
+++ b/mpi/tests/insert_task_cache.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -33,7 +33,6 @@ void func_cpu(__attribute__ ((unused)) void *descr[], __attribute__ ((unused)) v
 
				 
			
 
				 struct starpu_codelet mycodelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_RW, STARPU_R}
			
@@ -53,9 +52,8 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
				 	int ret;
			
 
				 	unsigned v[2][N];
			
 
				 	starpu_data_handle_t data_handles[2];
			
 
				-	char *string;
			
 
				+	char string[50];
			
 
				 
			
 
				-	string = malloc(50);
			
 
				 	sprintf(string, "STARPU_MPI_CACHE=%d", enabled);
			
 
				 	putenv(string);
			
 
				 
			
@@ -104,7 +102,6 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 
				 	starpu_mpi_comm_amounts_retrieve(comm_amount);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	free(string);
			
 
				 }
			
 
				 
			
 
				 int main(int argc, char **argv)
			
--- a/mpi/tests/insert_task_owner.c
+++ b/mpi/tests/insert_task_owner.c
@@ -32,7 +32,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet mycodelet_r_w =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_W}
			
@@ -40,7 +39,6 @@ struct starpu_codelet mycodelet_r_w =
 
				 
			
 
				 struct starpu_codelet mycodelet_rw_r =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_RW, STARPU_R}
			
@@ -48,7 +46,6 @@ struct starpu_codelet mycodelet_rw_r =
 
				 
			
 
				 struct starpu_codelet mycodelet_rw_rw =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_RW, STARPU_RW}
			
@@ -56,7 +53,6 @@ struct starpu_codelet mycodelet_rw_rw =
 
				 
			
 
				 struct starpu_codelet mycodelet_w_r =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_W, STARPU_R}
			
@@ -64,7 +60,6 @@ struct starpu_codelet mycodelet_w_r =
 
				 
			
 
				 struct starpu_codelet mycodelet_r_r =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_R}
			
--- a/mpi/tests/insert_task_owner2.c
+++ b/mpi/tests/insert_task_owner2.c
@@ -39,7 +39,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet mycodelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 4,
			
 
				 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}
			
--- a/mpi/tests/insert_task_owner_data.c
+++ b/mpi/tests/insert_task_owner_data.c
@@ -29,7 +29,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 struct starpu_codelet mycodelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {func_cpu, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_RW, STARPU_RW}
			
--- a/mpi/tests/mpi_detached_tag.c
+++ b/mpi/tests/mpi_detached_tag.c
@@ -74,6 +74,7 @@ int main(int argc, char **argv)
 
				 		starpu_tag_wait(tag);
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_irecv.c
+++ b/mpi/tests/mpi_irecv.c
@@ -73,6 +73,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_irecv_detached.c
+++ b/mpi/tests/mpi_irecv_detached.c
@@ -91,6 +91,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_isend.c
+++ b/mpi/tests/mpi_isend.c
@@ -74,6 +74,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_isend_detached.c
+++ b/mpi/tests/mpi_isend_detached.c
@@ -96,6 +96,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_probe.c
+++ b/mpi/tests/mpi_probe.c
@@ -91,6 +91,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(tab_handle);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
--- a/mpi/tests/mpi_reduction.c
+++ b/mpi/tests/mpi_reduction.c
@@ -24,7 +24,6 @@ extern void display_cpu_func(void *descr[], void *cl_arg);
 
				 
			
 
				 static struct starpu_codelet init_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {init_cpu_func, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.name = "init_codelet"
			
@@ -32,7 +31,6 @@ static struct starpu_codelet init_codelet =
 
				 
			
 
				 static struct starpu_codelet redux_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {redux_cpu_func, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.name = "redux_codelet"
			
@@ -40,7 +38,6 @@ static struct starpu_codelet redux_codelet =
 
				 
			
 
				 static struct starpu_codelet dot_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {dot_cpu_func, NULL},
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_REDUX},
			
@@ -49,7 +46,6 @@ static struct starpu_codelet dot_codelet =
 
				 
			
 
				 static struct starpu_codelet display_codelet =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {display_cpu_func, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_R},
			
--- a/mpi/tests/mpi_scatter_gather.c
+++ b/mpi/tests/mpi_scatter_gather.c
@@ -48,7 +48,6 @@ void cpu_codelet(void *descr[], void *_args)
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU,
			
 
				 	.cpu_funcs = {cpu_codelet, NULL},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_RW},
			
--- a/mpi/tests/ring.c
+++ b/mpi/tests/ring.c
@@ -39,7 +39,6 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 static struct starpu_codelet increment_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {increment_cuda, NULL},
			
 
				 #endif
			
--- a/mpi/tests/ring_async.c
+++ b/mpi/tests/ring_async.c
@@ -39,7 +39,6 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
				 
			
 
				 static struct starpu_codelet increment_cl =
			
 
				 {
			
 
				-	.where = STARPU_CPU|STARPU_CUDA,
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {increment_cuda, NULL},
			
 
				 #endif
			
--- a/mpi/tests/ring_async_implicit.c
+++ b/mpi/tests/ring_async_implicit.c