Browse Source

Merge from trunk @ 8824:8867

Marc Sergent 13 years ago
parent
commit
cc52f5a009
100 changed files with 401 additions and 393 deletions
  1. 4 0
      doc/chapters/advanced-api.texi
  2. 9 6
      doc/chapters/advanced-examples.texi
  3. 22 22
      doc/chapters/basic-api.texi
  4. 2 2
      doc/chapters/tips-tricks.texi
  5. 0 1
      doc/tutorial/hello_world.c
  6. 0 1
      doc/tutorial/vector_scal.c
  7. 3 4
      examples/audio/starpu_audio_processing.c
  8. 2 2
      examples/axpy/axpy.c
  9. 0 1
      examples/basic_examples/hello_world.c
  10. 2 3
      examples/basic_examples/mult.c
  11. 0 3
      examples/basic_examples/multiformat.c
  12. 0 4
      examples/basic_examples/multiformat_conversion_codelets.c
  13. 0 1
      examples/basic_examples/variable.c
  14. 0 1
      examples/basic_examples/vector_scal_c.c
  15. 0 1
      examples/callback/callback.c
  16. 5 5
      examples/cg/cg.c
  17. 0 9
      examples/cg/cg_kernels.c
  18. 4 7
      examples/cholesky/cholesky_grain_tag.c
  19. 4 7
      examples/cholesky/cholesky_implicit.c
  20. 4 7
      examples/cholesky/cholesky_tag.c
  21. 2 5
      examples/cholesky/cholesky_tile_tag.c
  22. 0 4
      examples/filters/custom_mf/custom_conversion_codelets.c
  23. 8 8
      examples/filters/custom_mf/custom_interface.c
  24. 0 3
      examples/filters/custom_mf/custom_mf_filter.c
  25. 1 2
      examples/filters/fblock.c
  26. 1 2
      examples/filters/fmatrix.c
  27. 1 2
      examples/filters/fvector.c
  28. 2 7
      examples/filters/shadow.c
  29. 5 10
      examples/filters/shadow2d.c
  30. 7 12
      examples/filters/shadow3d.c
  31. 0 1
      examples/gl_interop/gl_interop.c
  32. 0 1
      examples/gl_interop/gl_interop_idle.c
  33. 3 7
      examples/heat/dw_factolu.c
  34. 2 6
      examples/heat/dw_factolu_grain.c
  35. 2 6
      examples/heat/dw_factolu_tag.c
  36. 1 1
      examples/heat/dw_sparse_cg.c
  37. 1 1
      examples/heat/heat.c
  38. 0 1
      examples/incrementer/incrementer.c
  39. 5 5
      examples/interface/complex_interface.c
  40. 2 2
      examples/lu/lu_example.c
  41. 2 2
      examples/lu/xlu.c
  42. 2 2
      examples/lu/xlu_implicit.c
  43. 2 2
      examples/lu/xlu_implicit_pivot.c
  44. 2 2
      examples/lu/xlu_pivot.c
  45. 0 2
      examples/mandelbrot/mandelbrot.c
  46. 0 1
      examples/matvecmult/matvecmult.c
  47. 4 5
      examples/mult/xgemm.c
  48. 0 1
      examples/openmp/vector_scal.c
  49. 1 2
      examples/pi/pi.c
  50. 0 20
      examples/pi/pi_redux.c
  51. 2 13
      examples/pipeline/pipeline.c
  52. 2 3
      examples/ppm_downscaler/yuv_downscaler.c
  53. 0 1
      examples/profiling/profiling.c
  54. 2 2
      examples/reductions/dot_product.c
  55. 0 3
      examples/reductions/minmax_reduction.c
  56. 0 1
      examples/sched_ctx/sched_ctx.c
  57. 0 1
      examples/scheduler/dummy_sched.c
  58. 0 1
      examples/spmd/vector_scal_spmd.c
  59. 3 4
      examples/spmv/dw_block_spmv.c
  60. 1 2
      examples/spmv/spmv.c
  61. 0 24
      examples/stencil/stencil-kernels.c
  62. 0 1
      examples/stencil/stencil-tasks.c
  63. 0 1
      examples/top/hello_world_top.c
  64. 2 2
      gcc-plugin/examples/cholesky/cholesky.c
  65. 2 32
      include/starpu.h
  66. 2 2
      include/starpu_cublas.h
  67. 0 3
      include/starpu_data.h
  68. 16 16
      include/starpu_data_filters.h
  69. 0 5
      include/starpu_data_interfaces.h
  70. 35 1
      include/starpu_deprecated_api.h
  71. 67 0
      include/starpu_driver.h
  72. 3 1
      include/starpu_scheduler.h
  73. 41 0
      include/starpu_stdlib.h
  74. 1 1
      include/starpu_util.h
  75. 17 6
      mpi/examples/complex/mpi_complex.c
  76. 4 4
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  77. 2 5
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  78. 2 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h
  79. 3 3
      mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
  80. 16 16
      mpi/examples/mpi_lu/plu_example.c
  81. 0 1
      mpi/examples/stencil/stencil5.c
  82. 2 1
      mpi/src/starpu_mpi.c
  83. 51 2
      mpi/src/starpu_mpi_insert_task.c
  84. 0 1
      mpi/tests/insert_task.c
  85. 0 1
      mpi/tests/insert_task_block.c
  86. 2 5
      mpi/tests/insert_task_cache.c
  87. 0 5
      mpi/tests/insert_task_owner.c
  88. 0 1
      mpi/tests/insert_task_owner2.c
  89. 0 1
      mpi/tests/insert_task_owner_data.c
  90. 1 0
      mpi/tests/mpi_detached_tag.c
  91. 1 0
      mpi/tests/mpi_irecv.c
  92. 1 0
      mpi/tests/mpi_irecv_detached.c
  93. 1 0
      mpi/tests/mpi_isend.c
  94. 1 0
      mpi/tests/mpi_isend_detached.c
  95. 1 0
      mpi/tests/mpi_probe.c
  96. 0 4
      mpi/tests/mpi_reduction.c
  97. 0 1
      mpi/tests/mpi_scatter_gather.c
  98. 0 1
      mpi/tests/ring.c
  99. 0 1
      mpi/tests/ring_async.c
  100. 0 0
      mpi/tests/ring_async_implicit.c

+ 4 - 0
doc/chapters/advanced-api.texi

@@ -1034,6 +1034,10 @@ Check if the worker specified by workerid can execute the codelet. Schedulers ne
 Return the current date in µs
 @end deftypefun
 
+@deftypefun uint32_t starpu_task_footprint ({struct starpu_perfmodel *}@var{model}, {struct starpu_task *} @var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
+Returns the footprint for a given task.
+@end deftypefun
+
 @deftypefun double starpu_task_expected_length ({struct starpu_task *}@var{task}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl})
 Returns expected task duration in µs
 @end deftypefun

+ 9 - 6
doc/chapters/advanced-examples.texi

@@ -234,7 +234,7 @@ starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
 /* Partition the vector in PARTS sub-vectors */
 starpu_data_filter f =
 @{
-    .filter_func = starpu_block_filter_func_vector,
+    .filter_func = starpu_vector_filter_block,
     .nchildren = PARTS
 @};
 starpu_data_partition(handle, &f);
@@ -430,11 +430,14 @@ a name which is different from the execution time performance model.
 
 The application can request time estimations from the StarPU performance
 models by filling a task structure as usual without actually submitting
-it. The data handles can be created by calling @code{starpu_data_register}
-functions with a @code{NULL} pointer (and need to be unregistered as usual)
-and the desired data sizes. The @code{starpu_task_expected_length} and
-@code{starpu_task_expected_power} functions can then be called to get an
-estimation of the task duration on a given arch. @code{starpu_task_destroy}
+it. The data handles can be created by calling @code{starpu_*_data_register}
+functions with a @code{NULL} pointer and @code{-1} node and the
+desired data sizes, and need to be unregistered as usual. The
+@code{starpu_task_expected_length} and @code{starpu_task_expected_power}
+functions can then be called to get an estimation of the task cost on a given
+arch. @code{starpu_task_footprint} can also be used to get the footprint used
+for indexing history-based performance models.
+@code{starpu_task_destroy}
 needs to be called to destroy the dummy task afterwards. See
 @code{tests/perfmodels/regression_based.c} for an example.
 

+ 22 - 22
doc/chapters/basic-api.texi

@@ -1221,12 +1221,12 @@ Return the size of the elements registered into the matrix designated by
 Applications can provide their own interface. An example is provided in
 @code{examples/interface}. A few helpers are provided.
 
-@deftypefun uintptr_t starpu_allocate_buffer_on_node (unsigned @var{dst_node}, size_t @var{size})
+@deftypefun uintptr_t starpu_malloc_on_node (unsigned @var{dst_node}, size_t @var{size})
 Allocate @var{size} bytes on node @var{dst_node}. This returns 0 if the allocation
 failed; the allocation method should then return -ENOMEM as the allocated size.
 @end deftypefun
 
-@deftypefun void starpu_free_buffer_on_node (unsigned @var{dst_node}, uintptr_t @var{addr}, size_t @var{size})
+@deftypefun void starpu_free_on_node (unsigned @var{dst_node}, uintptr_t @var{addr}, size_t @var{size})
 Free @var{addr} of @var{size} bytes on node @var{dst_node}.
 @end deftypefun
 
@@ -1280,7 +1280,7 @@ subdata according to the filter @var{f}, as shown in the following example:
 @cartouche
 @smallexample
 struct starpu_data_filter f = @{
-    .filter_func = starpu_block_filter_func,
+    .filter_func = starpu_matrix_filter_block,
     .nchildren = nslicesx,
     .get_nchildren = NULL,
     .get_child_ops = NULL
@@ -1359,13 +1359,13 @@ list can be found in @code{starpu_data_filters.h} .
 @node Partitioning Vector Data
 @subsubsection Partitioning Vector Data
 
-@deftypefun void starpu_block_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_vector_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in
 @var{nparts} chunks of equal size.
 @end deftypefun
 
-@deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_vector_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in
 @var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow
@@ -1378,7 +1378,7 @@ enforced for the shadowed parts.
 A usage example is available in examples/filters/shadow.c
 @end deftypefun
 
-@deftypefun void starpu_vector_list_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_vector_filter_list (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned into
 @var{nparts} chunks according to the @code{filter_arg_ptr} field of
@@ -1389,7 +1389,7 @@ The @code{filter_arg_ptr} field must point to an array of @var{nparts}
 in each chunk of the partition.
 @end deftypefun
 
-@deftypefun void starpu_vector_divide_in_2_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_vector_filter_divide_in_2 (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in two
 chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
@@ -1400,13 +1400,13 @@ chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
 @node Partitioning Matrix Data
 @subsubsection Partitioning Matrix Data
 
-@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_matrix_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a dense Matrix along the x dimension, thus getting (x/nparts,y)
 matrices. If nparts does not divide x, the last submatrix contains the
 remainder.
 @end deftypefun
 
-@deftypefun void starpu_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_matrix_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a dense Matrix along the x dimension, with a shadow border
 @code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y)
 matrices. If nparts does not divide x-2*shadow, the last submatrix contains the
@@ -1418,13 +1418,13 @@ enforced for the shadowed parts.
 A usage example is available in examples/filters/shadow2d.c
 @end deftypefun
 
-@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_matrix_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a dense Matrix along the y dimension, thus getting (x,y/nparts)
 matrices. If nparts does not divide y, the last submatrix contains the
 remainder.
 @end deftypefun
 
-@deftypefun void starpu_vertical_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_matrix_filter_vertical_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a dense Matrix along the y dimension, with a shadow border
 @code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow)
 matrices. If nparts does not divide y-2*shadow, the last submatrix contains the
@@ -1441,13 +1441,13 @@ A usage example is available in examples/filters/shadow2d.c
 
 A usage example is available in examples/filters/shadow3d.c
 
-@deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the X dimension, thus getting (x/nparts,y,z)
 3D matrices. If nparts does not divide x, the last submatrix contains the
 remainder.
 @end deftypefun
 
-@deftypefun void starpu_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the X dimension, with a shadow border
 @code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y,z) 3D
 matrices. If nparts does not divide x, the last submatrix contains the
@@ -1457,13 +1457,13 @@ IMPORTANT: This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 @end deftypefun
 
-@deftypefun void starpu_vertical_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the Y dimension, thus getting (x,y/nparts,z)
 3D matrices. If nparts does not divide y, the last submatrix contains the
 remainder.
 @end deftypefun
 
-@deftypefun void starpu_vertical_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_vertical_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the Y dimension, with a shadow border
 @code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow,z) 3D
 matrices. If nparts does not divide y, the last submatrix contains the
@@ -1473,13 +1473,13 @@ IMPORTANT: This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 @end deftypefun
 
-@deftypefun void starpu_depth_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_depth_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the Z dimension, thus getting (x,y,z/nparts)
 3D matrices. If nparts does not divide z, the last submatrix contains the
 remainder.
 @end deftypefun
 
-@deftypefun void starpu_depth_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_block_filter_depth_block_shadow (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a 3D matrix along the Z dimension, with a shadow border
 @code{filter_arg_ptr}, thus getting (x,y,(z-2*shadow)/nparts+2*shadow)
 3D matrices. If nparts does not divide z, the last submatrix contains the
@@ -1492,11 +1492,11 @@ enforced for the shadowed parts.
 @node Partitioning BCSR Data
 @subsubsection Partitioning BCSR Data
 
-@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_bcsr_filter_canonical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a block-sparse matrix into dense matrices.
 @end deftypefun
 
-@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+@deftypefun void starpu_csr_filter_vertical_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 This partitions a block-sparse matrix into vertical block-sparse matrices.
 @end deftypefun
 
@@ -2533,15 +2533,15 @@ whether @code{devid} is among the @code{cuda_opengl_interoperability} field of
 the @code{starpu_conf} structure.
 @end deftypefun
 
-@deftypefun void starpu_helper_cublas_init (void)
+@deftypefun void starpu_cublas_init (void)
 This function initializes CUBLAS on every CUDA device.
 The CUBLAS library must be initialized prior to any CUBLAS call. Calling
-@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
+@code{starpu_cublas_init} will initialize CUBLAS on every CUDA device
 controlled by StarPU. This call blocks until CUBLAS has been properly
 initialized on every device.
 @end deftypefun
 
-@deftypefun void starpu_helper_cublas_shutdown (void)
+@deftypefun void starpu_cublas_shutdown (void)
 This function synchronously deinitializes the CUBLAS library on every CUDA device.
 @end deftypefun
 

+ 2 - 2
doc/chapters/tips-tricks.texi

@@ -61,7 +61,7 @@ static void fft(void *descr[], void *_args)
 Another way to go which may be needed is to execute some code from the workers
 themselves thanks to @code{starpu_execute_on_each_worker}. This may be required
 by CUDA to behave properly due to threading issues. For instance, StarPU's
-@code{starpu_helper_cublas_init} looks like the following to call
+@code{starpu_cublas_init} looks like the following to call
 @code{cublasInit} from the workers themselves:
 
 @cartouche
@@ -71,7 +71,7 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
     cublasStatus cublasst = cublasInit();
     cublasSetKernelStream(starpu_cuda_get_local_stream());
 @}
-void starpu_helper_cublas_init(void)
+void starpu_cublas_init(void)
 @{
     starpu_execute_on_each_worker(init_cublas_func, NULL, STARPU_CUDA);
 @}

+ 0 - 1
doc/tutorial/hello_world.c

@@ -32,7 +32,6 @@ void cpu_func(void *buffers[], void *cl_arg)
 
 struct starpu_codelet cl =
 {
-    .where = STARPU_CPU,
     .cpu_funcs = {cpu_func, NULL},
     .nbuffers = 0
 };

+ 0 - 1
doc/tutorial/vector_scal.c

@@ -31,7 +31,6 @@ extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
 static struct starpu_codelet cl = {
-    .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
     .cpu_funcs = {scal_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA

+ 3 - 4
examples/audio/starpu_audio_processing.c

@@ -283,7 +283,6 @@ struct starpu_perfmodel band_filter_model =
 static struct starpu_codelet band_filter_cl =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {band_filter_kernel_gpu, NULL},
 #endif
@@ -413,13 +412,13 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_filter_func_vector,
+		.filter_func = starpu_vector_filter_block,
 		.nchildren = niter
 	};
 
@@ -463,7 +462,7 @@ int main(int argc, char **argv)
 	starpu_data_unpartition(A_handle, 0);
 	starpu_data_unregister(A_handle);
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 
 	/* we are done ! */
 	starpu_shutdown();

+ 2 - 2
examples/axpy/axpy.c

@@ -128,7 +128,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	/* This is equivalent to
 		vec_a = malloc(N*sizeof(TYPE));
@@ -157,7 +157,7 @@ int main(int argc, char **argv)
 	/* Divide the vector into blocks */
 	struct starpu_data_filter block_filter =
 	{
-		.filter_func = starpu_block_filter_func_vector,
+		.filter_func = starpu_vector_filter_block,
 		.nchildren = NBLOCKS
 	};
 

+ 0 - 1
examples/basic_examples/hello_world.c

@@ -82,7 +82,6 @@ int main(int argc, char **argv)
 
 	/* this codelet may only be executed on a CPU, and its cpu
  	 * implementation is function "cpu_func" */
-	cl.where = STARPU_CPU;
 	cl.cpu_funcs[0] = cpu_func;
 	/* the codelet does not manipulate any data that is managed
 	 * by our DSM */

+ 2 - 3
examples/basic_examples/mult.c

@@ -194,13 +194,13 @@ static void partition_mult_data(void)
 	 * name of the filters are a bit misleading */
 	struct starpu_data_filter vert =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nslicesx
 	};
 
 	struct starpu_data_filter horiz =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nslicesy
 	};
 
@@ -263,7 +263,6 @@ static struct starpu_perfmodel mult_perf_model =
 static struct starpu_codelet cl =
 {
         /* we can only execute that kernel on a CPU yet */
-        .where = STARPU_CPU,
         /* CPU implementation of the codelet */
         .cpu_funcs = {cpu_mult, NULL},
         /* the codelet manipulates 3 buffers that are managed by the

+ 0 - 3
examples/basic_examples/multiformat.c

@@ -79,7 +79,6 @@ extern void multiformat_scal_opencl_func(void *buffers[], void *arg);
 #ifdef STARPU_USE_CPU
 static struct starpu_codelet cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {multiformat_scal_cpu_func, NULL},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
@@ -90,7 +89,6 @@ static struct starpu_codelet cpu_cl =
 #ifdef STARPU_USE_CUDA
 static struct starpu_codelet cuda_cl =
 {
-	.where = STARPU_CUDA,
 	.cuda_funcs = { multiformat_scal_cuda_func, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
@@ -101,7 +99,6 @@ static struct starpu_codelet cuda_cl =
 #ifdef STARPU_USE_OPENCL
 static struct starpu_codelet opencl_cl =
 {
-	.where = STARPU_OPENCL,
 	.opencl_funcs = { multiformat_scal_opencl_func, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_RW },

+ 0 - 4
examples/basic_examples/multiformat_conversion_codelets.c

@@ -34,7 +34,6 @@ void cuda_to_cpu(void *buffers[], void *arg)
 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
-	.where = STARPU_CUDA,
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
 	.nbuffers = 1,
 	.name = "codelet_cpu_to_cuda"
@@ -42,7 +41,6 @@ struct starpu_codelet cpu_to_cuda_cl =
 
 struct starpu_codelet cuda_to_cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {cuda_to_cpu, NULL},
 	.nbuffers = 1,
 	.name = "codelet_cude_to_cpu"
@@ -67,14 +65,12 @@ void opencl_to_cpu(void *buffers[], void *arg)
 extern void cpu_to_opencl_opencl_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_opencl_cl =
 {
-	.where = STARPU_OPENCL,
 	.opencl_funcs = {cpu_to_opencl_opencl_func, NULL},
 	.nbuffers = 1
 };
 
 struct starpu_codelet opencl_to_cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {opencl_to_cpu, NULL},
 	.nbuffers = 1
 };

+ 0 - 1
examples/basic_examples/variable.c

@@ -59,7 +59,6 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
-	cl.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL;
         cl.cpu_funcs[0] = cpu_codelet;
 #ifdef STARPU_USE_CUDA
         cl.cuda_funcs[0] = cuda_codelet;

+ 0 - 1
examples/basic_examples/vector_scal_c.c

@@ -41,7 +41,6 @@ static struct starpu_perfmodel vector_scal_model =
 static struct starpu_codelet cl =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU | STARPU_CUDA,
 	/* CPU implementation of the codelet */
 	.cpu_funcs = {scal_cpu_func, NULL},
 #ifdef STARPU_USE_CUDA

+ 0 - 1
examples/callback/callback.c

@@ -33,7 +33,6 @@ void cpu_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 struct starpu_codelet cl =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU,
 	.cpu_funcs = {cpu_codelet, NULL},
 	.nbuffers = 1
 };

+ 5 - 5
examples/cg/cg.c

@@ -197,10 +197,10 @@ static void partition_data(void)
 	 */
 
 	/* Partition into contiguous parts */
-	matrix_filter_1.filter_func = starpu_block_filter_func;
+	matrix_filter_1.filter_func = starpu_matrix_filter_block;
 	matrix_filter_1.nchildren = nblocks;
 	/* Partition into non-contiguous parts */
-	matrix_filter_2.filter_func = starpu_vertical_block_filter_func;
+	matrix_filter_2.filter_func = starpu_matrix_filter_vertical_block;
 	matrix_filter_2.nchildren = nblocks;
 
 	/* A is in FORTRAN ordering, starpu_data_get_sub_data(A_handle, 2, i,
@@ -211,7 +211,7 @@ static void partition_data(void)
 	 *	Partition the vectors
 	 */
 
-	vector_filter.filter_func = starpu_block_filter_func_vector;
+	vector_filter.filter_func = starpu_vector_filter_block;
 	vector_filter.nchildren = nblocks;
 
 	starpu_data_partition(b_handle, &vector_filter);
@@ -417,7 +417,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	generate_random_problem();
 	register_data();
@@ -431,7 +431,7 @@ int main(int argc, char **argv)
 	starpu_task_wait_for_all();
 	unregister_data();
 	free_data();
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 
 	return ret;

+ 0 - 9
examples/cg/cg_kernels.c

@@ -94,7 +94,6 @@ static struct starpu_perfmodel accumulate_variable_model =
 struct starpu_codelet accumulate_variable_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {accumulate_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_variable_cuda, NULL},
@@ -133,7 +132,6 @@ static struct starpu_perfmodel accumulate_vector_model =
 struct starpu_codelet accumulate_vector_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {accumulate_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {accumulate_vector_cuda, NULL},
@@ -174,7 +172,6 @@ static struct starpu_perfmodel bzero_variable_model =
 struct starpu_codelet bzero_variable_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {bzero_variable_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_variable_cuda, NULL},
@@ -212,7 +209,6 @@ static struct starpu_perfmodel bzero_vector_model =
 struct starpu_codelet bzero_vector_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {bzero_vector_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {bzero_vector_cuda, NULL},
@@ -268,7 +264,6 @@ static struct starpu_perfmodel dot_kernel_model =
 static struct starpu_codelet dot_kernel_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dot_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_kernel_cuda, NULL},
@@ -348,7 +343,6 @@ static struct starpu_perfmodel scal_kernel_model =
 static struct starpu_codelet scal_kernel_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {scal_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_kernel_cuda, NULL},
@@ -422,7 +416,6 @@ static struct starpu_perfmodel gemv_kernel_model =
 static struct starpu_codelet gemv_kernel_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {gemv_kernel_cpu, NULL},
@@ -522,7 +515,6 @@ static struct starpu_perfmodel scal_axpy_kernel_model =
 static struct starpu_codelet scal_axpy_kernel_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {scal_axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {scal_axpy_kernel_cuda, NULL},
@@ -597,7 +589,6 @@ static struct starpu_perfmodel axpy_kernel_model =
 static struct starpu_codelet axpy_kernel_cl =
 {
 	.can_execute = can_execute,
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {axpy_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {axpy_kernel_cuda, NULL},

+ 4 - 7
examples/cholesky/cholesky_grain_tag.c

@@ -39,7 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
@@ -77,7 +76,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
@@ -124,7 +122,6 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
@@ -198,13 +195,13 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
@@ -295,7 +292,7 @@ static void initialize_system(float **A, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID
 	if (pinned)
@@ -341,7 +338,7 @@ static void shutdown_system(float **matA, unsigned pinned)
 	     free(*matA);
 	}
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 }
 

+ 4 - 7
examples/cholesky/cholesky_implicit.c

@@ -24,7 +24,6 @@
 
 static struct starpu_codelet cl11 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
@@ -39,7 +38,6 @@ static struct starpu_codelet cl11 =
 
 static struct starpu_codelet cl21 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
@@ -54,7 +52,6 @@ static struct starpu_codelet cl21 =
 
 static struct starpu_codelet cl22 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
@@ -185,13 +182,13 @@ static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
@@ -344,7 +341,7 @@ int main(int argc, char **argv)
                 return 77;
         STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	if(with_ctxs)
 	{
@@ -360,7 +357,7 @@ int main(int argc, char **argv)
 	else
 		execute_cholesky(size, nblocks);
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 
 	return ret;

+ 4 - 7
examples/cholesky/cholesky_tag.c

@@ -39,7 +39,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
@@ -78,7 +77,6 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
@@ -127,7 +125,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
@@ -261,7 +258,7 @@ static int initialize_system(float **A, unsigned dim, unsigned pinned)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID
 	if (pinned)
@@ -288,13 +285,13 @@ static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
@@ -316,7 +313,7 @@ static void shutdown_system(float **matA, unsigned pinned)
 		free(*matA);
 	}
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 }
 

+ 2 - 5
examples/cholesky/cholesky_tile_tag.c

@@ -42,7 +42,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
@@ -80,7 +79,6 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
@@ -127,7 +125,6 @@ static int create_task_21(unsigned k, unsigned j)
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
@@ -260,7 +257,7 @@ int main(int argc, char **argv)
 	/* Disable sequential consistency */
 	starpu_data_set_default_sequential_consistency_flag(0);
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 #ifndef STARPU_SIMGRID
 	for (y = 0; y < nblocks; y++)
@@ -321,7 +318,7 @@ int main(int argc, char **argv)
 		}
 	}
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 
 	starpu_shutdown();
 	return ret;

+ 0 - 4
examples/filters/custom_mf/custom_conversion_codelets.c

@@ -39,7 +39,6 @@ void cuda_to_cpu(void *buffers[], void *arg)
 extern void cpu_to_cuda_cuda_func(void *buffers[], void *args);
 struct starpu_codelet cpu_to_cuda_cl =
 {
-	.where = STARPU_CUDA,
 	.cuda_funcs = {cpu_to_cuda_cuda_func, NULL},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
@@ -48,7 +47,6 @@ struct starpu_codelet cpu_to_cuda_cl =
 
 struct starpu_codelet cuda_to_cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {cuda_to_cpu, NULL},
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
@@ -77,7 +75,6 @@ extern void cpu_to_opencl_opencl_func(void *buffers[], void *arg);
 
 struct starpu_codelet cpu_to_opencl_cl =
 {
-	.where = STARPU_OPENCL,
 	.opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
 	.modes = { STARPU_RW },
 	.nbuffers = 1,
@@ -86,7 +83,6 @@ struct starpu_codelet cpu_to_opencl_cl =
 
 struct starpu_codelet opencl_to_cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = { opencl_to_cpu_cpu_func, NULL },
 	.modes = { STARPU_RW },
 	.nbuffers = 1,

+ 8 - 8
examples/filters/custom_mf/custom_interface.c

@@ -150,16 +150,16 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, unsigned nod
 	custom_interface = (struct custom_data_interface *) data_interface;
 
 	size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
-	custom_interface->cpu_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	custom_interface->cpu_ptr = (void*) starpu_malloc_on_node(node, size);
 	if (!custom_interface->cpu_ptr)
 		goto fail_cpu;
 #ifdef STARPU_USE_CUDA
-	custom_interface->cuda_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	custom_interface->cuda_ptr = (void*) starpu_malloc_on_node(node, size);
 	if (!custom_interface->cuda_ptr)
 		goto fail_cuda;
 #endif
 #ifdef STARPU_USE_OPENCL
-	custom_interface->opencl_ptr = (void*) starpu_allocate_buffer_on_node(node, size);
+	custom_interface->opencl_ptr = (void*) starpu_malloc_on_node(node, size);
 	if (!custom_interface->opencl_ptr)
 		goto fail_opencl;
 #endif
@@ -175,13 +175,13 @@ static ssize_t allocate_custom_buffer_on_node(void *data_interface, unsigned nod
 #ifdef STARPU_USE_OPENCL
 fail_opencl:
 #ifdef STARPU_USE_CUDA
-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+	starpu_free_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
 #endif
 #endif
 #ifdef STARPU_USE_CUDA
 fail_cuda:
 #endif
-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
+	starpu_free_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
 fail_cpu:
 	return -ENOMEM;
 }
@@ -191,12 +191,12 @@ static void free_custom_buffer_on_node(void *data_interface, unsigned node)
 	struct custom_data_interface *custom_interface = (struct custom_data_interface *) data_interface;
 	size_t size = custom_interface->nx * custom_interface->ops->cpu_elemsize;
 
-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
+	starpu_free_on_node(node, (uintptr_t) custom_interface->cpu_ptr, size);
 #ifdef STARPU_USE_CUDA
-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
+	starpu_free_on_node(node, (uintptr_t) custom_interface->cuda_ptr, size);
 #endif
 #ifdef STARPU_USE_OPENCL
-	starpu_free_buffer_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
+	starpu_free_on_node(node, (uintptr_t) custom_interface->opencl_ptr, size);
 #endif
 }
 

+ 0 - 3
examples/filters/custom_mf/custom_mf_filter.c

@@ -148,7 +148,6 @@ extern void custom_scal_cuda_func(void *buffers[], void *args);
 
 static struct starpu_codelet cpu_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = { custom_scal_cpu_func, NULL},
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
@@ -158,7 +157,6 @@ static struct starpu_codelet cpu_cl =
 #ifdef STARPU_USE_CUDA
 static struct starpu_codelet cuda_cl =
 {
-	.where = STARPU_CUDA,
 	.cuda_funcs = { custom_scal_cuda_func, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_RW },
@@ -171,7 +169,6 @@ extern void custom_scal_opencl_func(void *buffers[], void *args);
 
 static struct starpu_codelet opencl_cl =
 {
-	.where = STARPU_OPENCL,
 	.opencl_funcs = { custom_scal_opencl_func, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_RW },

+ 1 - 2
examples/filters/fblock.c

@@ -91,7 +91,6 @@ int main(int argc, char **argv)
 	starpu_data_handle_t handle;
 	struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
                 .cpu_funcs = {cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
@@ -121,7 +120,7 @@ int main(int argc, char **argv)
         /* Partition the block in PARTS sub-blocks */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_filter_func_block,
+		.filter_func = starpu_block_filter_block,
 		.nchildren = PARTS
 	};
         starpu_data_partition(handle, &f);

+ 1 - 2
examples/filters/fmatrix.c

@@ -62,7 +62,6 @@ int main(int argc, char **argv)
         starpu_data_handle_t handle;
         struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU,
                 .cpu_funcs = {cpu_func, NULL},
                 .nbuffers = 1,
 		.modes = {STARPU_RW}
@@ -79,7 +78,7 @@ int main(int argc, char **argv)
         /* Partition the matrix in PARTS sub-matrices */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = PARTS
 	};
 	starpu_data_partition(handle, &f);

+ 1 - 2
examples/filters/fvector.c

@@ -45,7 +45,6 @@ int main(int argc, char **argv)
 
         struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU,
                 .cpu_funcs = {cpu_func, NULL},
                 .nbuffers = 1,
 		.modes = {STARPU_RW}
@@ -67,7 +66,7 @@ int main(int argc, char **argv)
         /* Partition the vector in PARTS sub-vectors */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_filter_func_vector,
+		.filter_func = starpu_vector_filter_block,
 		.nchildren = PARTS
 	};
 	starpu_data_partition(handle, &f);

+ 2 - 7
examples/filters/shadow.c

@@ -99,11 +99,6 @@ int main(int argc, char **argv)
 
         struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
                 .cpu_funcs = {cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
@@ -136,7 +131,7 @@ int main(int argc, char **argv)
 	 * combined. */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_shadow_filter_func_vector,
+		.filter_func = starpu_vector_filter_block_shadow,
 		.nchildren = PARTS,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOW /* Shadow width */
 	};
@@ -145,7 +140,7 @@ int main(int argc, char **argv)
         /* Partition the destination vector in PARTS sub-vectors */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func_vector,
+		.filter_func = starpu_vector_filter_block,
 		.nchildren = PARTS,
 	};
 	starpu_data_partition(handle2, &f2);

+ 5 - 10
examples/filters/shadow2d.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -153,11 +153,6 @@ int main(int argc, char **argv)
 
         struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
                 .cpu_funcs = {cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
@@ -217,13 +212,13 @@ int main(int argc, char **argv)
 	 * combined. */
 	struct starpu_data_filter fy =
 	{
-		.filter_func = starpu_vertical_block_shadow_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block_shadow,
 		.nchildren = PARTSY,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
 	};
 	struct starpu_data_filter fx =
 	{
-		.filter_func = starpu_block_shadow_filter_func,
+		.filter_func = starpu_matrix_filter_block_shadow,
 		.nchildren = PARTSX,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
 	};
@@ -232,12 +227,12 @@ int main(int argc, char **argv)
         /* Partition the destination matrix in PARTSY*PARTSX sub-matrices */
 	struct starpu_data_filter fy2 =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = PARTSY,
 	};
 	struct starpu_data_filter fx2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = PARTSX,
 	};
 	starpu_data_map_filters(handle2, 2, &fy2, &fx2);

+ 7 - 12
examples/filters/shadow3d.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -120,11 +120,6 @@ int main(int argc, char **argv)
 
         struct starpu_codelet cl =
 	{
-                .where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-			|STARPU_CUDA
-#endif
-			,
                 .cpu_funcs = {cpu_func, NULL},
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
@@ -235,19 +230,19 @@ int main(int argc, char **argv)
 	 * combined. */
 	struct starpu_data_filter fz =
 	{
-		.filter_func = starpu_depth_block_shadow_filter_func_block,
+		.filter_func = starpu_block_filter_depth_block_shadow,
 		.nchildren = PARTSZ,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
 	};
 	struct starpu_data_filter fy =
 	{
-		.filter_func = starpu_vertical_block_shadow_filter_func_block,
+		.filter_func = starpu_block_filter_vertical_block_shadow,
 		.nchildren = PARTSY,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
 	};
 	struct starpu_data_filter fx =
 	{
-		.filter_func = starpu_block_shadow_filter_func_block,
+		.filter_func = starpu_block_filter_block_shadow,
 		.nchildren = PARTSX,
 		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
 	};
@@ -256,17 +251,17 @@ int main(int argc, char **argv)
         /* Partition the destination matrix in PARTSZ*PARTSY*PARTSX sub-matrices */
 	struct starpu_data_filter fz2 =
 	{
-		.filter_func = starpu_depth_block_filter_func_block,
+		.filter_func = starpu_block_filter_depth_block,
 		.nchildren = PARTSZ,
 	};
 	struct starpu_data_filter fy2 =
 	{
-		.filter_func = starpu_vertical_block_filter_func_block,
+		.filter_func = starpu_block_filter_vertical_block,
 		.nchildren = PARTSY,
 	};
 	struct starpu_data_filter fx2 =
 	{
-		.filter_func = starpu_block_filter_func_block,
+		.filter_func = starpu_block_filter_block,
 		.nchildren = PARTSX,
 	};
 	starpu_data_map_filters(handle2, 3, &fz2, &fy2, &fx2);

+ 0 - 1
examples/gl_interop/gl_interop.c

@@ -39,7 +39,6 @@ void dummy(void *buffers[], void *cl_arg)
 }
 
 struct starpu_codelet cl = {
-	.where = STARPU_CUDA,
 	.cuda_funcs = { dummy, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_W },

+ 0 - 1
examples/gl_interop/gl_interop_idle.c

@@ -42,7 +42,6 @@ void dummy(void *buffers[], void *cl_arg)
 }
 
 struct starpu_codelet cl = {
-	.where = STARPU_CUDA,
 	.cuda_funcs = { dummy, NULL },
 	.nbuffers = 1,
 	.modes = { STARPU_W },

+ 3 - 7
examples/heat/dw_factolu.c

@@ -36,7 +36,6 @@ static unsigned no_prio = 0;
 
 static struct starpu_codelet cl11 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
@@ -48,7 +47,6 @@ static struct starpu_codelet cl11 =
 
 static struct starpu_codelet cl12 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
@@ -60,7 +58,6 @@ static struct starpu_codelet cl12 =
 
 static struct starpu_codelet cl21 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
@@ -72,7 +69,6 @@ static struct starpu_codelet cl21 =
 
 static struct starpu_codelet cl22 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
@@ -705,7 +701,7 @@ void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	if (pinned)
 	{
@@ -759,13 +755,13 @@ void dw_factoLU(float *matA, unsigned size,
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 2 - 6
examples/heat/dw_factolu_grain.c

@@ -45,7 +45,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
@@ -80,7 +79,6 @@ static struct starpu_task *create_task_11(starpu_data_handle_t dataA, unsigned k
 static struct starpu_codelet cl12 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
@@ -125,7 +123,6 @@ static void create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
@@ -167,7 +164,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, u
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
@@ -227,13 +223,13 @@ static void dw_factoLU_grain_inner(float *matA, unsigned size, unsigned inner_si
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 2 - 6
examples/heat/dw_factolu_tag.c

@@ -47,7 +47,6 @@ static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_codelet cl11 =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
@@ -83,7 +82,6 @@ static struct starpu_task *create_task_11(starpu_data_handle_t dataA, unsigned k
 static struct starpu_codelet cl12 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
@@ -128,7 +126,6 @@ static void create_task_12(starpu_data_handle_t dataA, unsigned k, unsigned i)
 static struct starpu_codelet cl21 =
 {
 	.modes = { STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
@@ -170,7 +167,6 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 static struct starpu_codelet cl22 =
 {
 	.modes = { STARPU_R, STARPU_R, STARPU_RW },
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
@@ -305,13 +301,13 @@ void dw_factoLU_tag(float *matA, unsigned size, unsigned ld, unsigned nblocks, u
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 1 - 1
examples/heat/dw_sparse_cg.c

@@ -431,7 +431,7 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
 }

+ 1 - 1
examples/heat/heat.c

@@ -788,7 +788,7 @@ int main(int argc, char **argv)
 		if (check)
 			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
 
-		starpu_helper_cublas_shutdown();
+		starpu_cublas_shutdown();
 		starpu_shutdown();
 		free_system(A, B, newsize, pinned);
 	}

+ 0 - 1
examples/incrementer/incrementer.c

@@ -66,7 +66,6 @@ int main(int argc, char **argv)
 
 	struct starpu_codelet cl =
 	{
-		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 		.cpu_funcs = {cpu_codelet, NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},

+ 5 - 5
examples/interface/complex_interface.c

@@ -66,10 +66,10 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsign
 	double *addr_imaginary = 0;
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 
-	addr_real = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	addr_real = (double*) starpu_malloc_on_node(node, requested_memory);
 	if (!addr_real)
 		goto fail_real;
-	addr_imaginary = (double*) starpu_allocate_buffer_on_node(node, requested_memory);
+	addr_imaginary = (double*) starpu_malloc_on_node(node, requested_memory);
 	if (!addr_imaginary)
 		goto fail_imaginary;
 
@@ -80,7 +80,7 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsign
 	return 2*requested_memory;
 
 fail_imaginary:
-	starpu_free_buffer_on_node(node, (uintptr_t) addr_real, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) addr_real, requested_memory);
 fail_real:
 	return -ENOMEM;
 }
@@ -90,8 +90,8 @@ static void complex_free_data_on_node(void *data_interface, unsigned node)
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 	ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 
-	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
-	starpu_free_buffer_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
 }
 
 static size_t complex_get_size(starpu_data_handle_t handle)

+ 2 - 2
examples/lu/lu_example.c

@@ -310,7 +310,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	init_matrix();
 
@@ -414,7 +414,7 @@ int main(int argc, char **argv)
 	starpu_free(A);
 
 	FPRINTF(stderr, "Shutting down\n");
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 
 	starpu_shutdown();
 

+ 2 - 2
examples/lu/xlu.c

@@ -256,13 +256,13 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 2 - 2
examples/lu/xlu_implicit.c

@@ -156,13 +156,13 @@ int STARPU_LU(lu_decomposition)(TYPE *matA, unsigned size, unsigned ld, unsigned
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 2 - 2
examples/lu/xlu_implicit_pivot.c

@@ -210,13 +210,13 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 2 - 2
examples/lu/xlu_pivot.c

@@ -345,13 +345,13 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_vertical_block_filter_func,
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_block_filter_func,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 

+ 0 - 2
examples/mandelbrot/mandelbrot.c

@@ -373,7 +373,6 @@ static void compute_block_spmd(void *descr[], void *cl_arg)
 
 static struct starpu_codelet spmd_mandelbrot_cl =
 {
-	.where = STARPU_CPU|STARPU_OPENCL,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {compute_block_spmd, NULL},
@@ -385,7 +384,6 @@ static struct starpu_codelet spmd_mandelbrot_cl =
 
 static struct starpu_codelet mandelbrot_cl =
 {
-	.where = STARPU_CPU|STARPU_OPENCL,
 	.type = STARPU_SEQ,
 	.cpu_funcs = {compute_block, NULL},
 #ifdef STARPU_USE_OPENCL

+ 0 - 1
examples/matvecmult/matvecmult.c

@@ -129,7 +129,6 @@ static struct starpu_perfmodel starpu_matvecmult_model =
 
 static struct starpu_codelet cl =
 {
-	.where = STARPU_OPENCL,
 #ifdef STARPU_USE_OPENCL
         .opencl_funcs[0] = opencl_codelet,
 #endif

+ 4 - 5
examples/mult/xgemm.c

@@ -119,12 +119,12 @@ static void partition_mult_data(void)
 
 	struct starpu_data_filter vert;
 	memset(&vert, 0, sizeof(vert));
-	vert.filter_func = starpu_vertical_block_filter_func;
+	vert.filter_func = starpu_matrix_filter_vertical_block;
 	vert.nchildren = nslicesx;
 
 	struct starpu_data_filter horiz;
 	memset(&horiz, 0, sizeof(horiz));
-	horiz.filter_func = starpu_block_filter_func;
+	horiz.filter_func = starpu_matrix_filter_block;
 	horiz.nchildren = nslicesy;
 
 	starpu_data_partition(B_handle, &vert);
@@ -202,7 +202,6 @@ static struct starpu_perfmodel starpu_gemm_model =
 
 static struct starpu_codelet cl =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.type = STARPU_SEQ, /* changed to STARPU_SPMD if -spmd is passed */
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {cpu_mult, NULL},
@@ -297,7 +296,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	init_problem_data();
 	partition_mult_data();
@@ -357,7 +356,7 @@ enodev:
 	starpu_free(B);
 	starpu_free(C);
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 
 	return ret;

+ 0 - 1
examples/openmp/vector_scal.c

@@ -57,7 +57,6 @@ static struct starpu_perfmodel vector_scal_model =
 static struct starpu_codelet cl =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU,
 	.type = STARPU_FORKJOIN,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {scal_cpu_func, NULL},

+ 1 - 2
examples/pi/pi.c

@@ -114,7 +114,7 @@ int main(int argc, char **argv)
 
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_block_filter_func_vector,
+		.filter_func = starpu_vector_filter_block,
 		.nchildren = ntasks
 	};
 	
@@ -129,7 +129,6 @@ int main(int argc, char **argv)
 
 	struct starpu_codelet cl =
 	{
-		.where = STARPU_CPU|STARPU_CUDA,
 		.cpu_funcs = {cpu_kernel, NULL},
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_kernel, NULL},

+ 0 - 20
examples/pi/pi_redux.c

@@ -189,11 +189,6 @@ static void pi_func_cuda(void *descr[], void *cl_arg __attribute__ ((unused)))
 
 static struct starpu_codelet pi_cl =
 {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {pi_func_cpu, NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {pi_func_cuda, NULL},
@@ -205,11 +200,6 @@ static struct starpu_codelet pi_cl =
 
 static struct starpu_codelet pi_cl_redux =
 {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {pi_func_cpu, NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {pi_func_cuda, NULL},
@@ -240,11 +230,6 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 
 static struct starpu_codelet init_codelet =
 {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
         .cpu_funcs = {init_cpu_func, NULL},
 #ifdef STARPU_HAVE_CURAND
         .cuda_funcs = {init_cuda_func, NULL},
@@ -282,11 +267,6 @@ static void redux_cpu_func(void *descr[], void *cl_arg)
 
 static struct starpu_codelet redux_codelet =
 {
-	.where =
-#ifdef STARPU_HAVE_CURAND
-		STARPU_CUDA|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {redux_cpu_func, NULL},
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {redux_cuda_func, NULL},

+ 2 - 13
examples/pipeline/pipeline.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -77,7 +77,6 @@ static struct starpu_perfmodel pipeline_model_x =
 
 static struct starpu_codelet pipeline_codelet_x =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {pipeline_cpu_x, NULL},
 	.nbuffers = 1,
 	.modes = {STARPU_W},
@@ -113,11 +112,6 @@ static struct starpu_perfmodel pipeline_model_axpy =
 
 static struct starpu_codelet pipeline_codelet_axpy =
 {
-	.where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-		| STARPU_CUDA
-#endif
-		,
 	.cpu_funcs = {pipeline_cpu_axpy, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_axpy, NULL},
@@ -160,11 +154,6 @@ static struct starpu_perfmodel pipeline_model_sum =
 
 static struct starpu_codelet pipeline_codelet_sum =
 {
-	.where = STARPU_CPU
-#ifdef STARPU_USE_CUDA
-		| STARPU_CUDA
-#endif
-		,
 	.cpu_funcs = {pipeline_cpu_sum, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {pipeline_cublas_sum, NULL},
@@ -186,7 +175,7 @@ int main(void)
 		exit(77);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	/* Initialize the K temporary buffers. No need to allocate them ourselves,
 	 * since the X and Y kernels will fill in the initial values. */

+ 2 - 3
examples/ppm_downscaler/yuv_downscaler.c

@@ -86,7 +86,6 @@ static void ds_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
 
 static struct starpu_codelet ds_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {ds_kernel_cpu, NULL},
 	.nbuffers = 2, /* input -> output */
 	.modes = {STARPU_R, STARPU_W},
@@ -96,13 +95,13 @@ static struct starpu_codelet ds_codelet =
 /* each block contains BLOCK_HEIGHT consecutive lines */
 static struct starpu_data_filter filter_y =
 {
-	.filter_func = starpu_block_filter_func,
+	.filter_func = starpu_matrix_filter_block,
 	.nchildren= HEIGHT/BLOCK_HEIGHT
 };
 
 static struct starpu_data_filter filter_uv =
 {
-	.filter_func = starpu_block_filter_func,
+	.filter_func = starpu_matrix_filter_block,
 	.nchildren = (HEIGHT/2)/BLOCK_HEIGHT
 };
 

+ 0 - 1
examples/profiling/profiling.c

@@ -50,7 +50,6 @@ int main(int argc, char **argv)
 
 	struct starpu_codelet cl =
 	{
-		.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 		.cpu_funcs = {sleep_codelet, NULL},
 		.cuda_funcs = {sleep_codelet, NULL},
 		.opencl_funcs = {sleep_codelet, NULL},

+ 2 - 2
examples/reductions/dot_product.c

@@ -333,7 +333,7 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
 #endif
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	unsigned long nelems = nblocks*entries_per_block;
 	size_t size = nelems*sizeof(float);
@@ -400,7 +400,7 @@ int main(int argc, char **argv)
 
 	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 
 #ifdef STARPU_USE_OPENCL
         ret = starpu_opencl_unload_opencl(&opencl_program);

+ 0 - 3
examples/reductions/minmax_reduction.c

@@ -57,7 +57,6 @@ static void minmax_neutral_cpu_func(void *descr[], void *cl_arg)
 
 static struct starpu_codelet minmax_init_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {minmax_neutral_cpu_func, NULL},
 	.nbuffers = 1
 };
@@ -84,7 +83,6 @@ void minmax_redux_cpu_func(void *descr[], void *cl_arg)
 
 static struct starpu_codelet minmax_redux_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {minmax_redux_cpu_func, NULL},
 	.nbuffers = 2
 };
@@ -119,7 +117,6 @@ void minmax_cpu_func(void *descr[], void *cl_arg)
 
 static struct starpu_codelet minmax_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {minmax_cpu_func, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_REDUX}

+ 0 - 1
examples/sched_ctx/sched_ctx.c

@@ -32,7 +32,6 @@ static void sched_ctx_func(void *descr[] __attribute__ ((unused)), void *arg __a
 
 static struct starpu_codelet sched_ctx_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_funcs = {sched_ctx_func, NULL},
 	.cuda_funcs = {sched_ctx_func, NULL},
 	.opencl_funcs = {sched_ctx_func, NULL},

+ 0 - 1
examples/scheduler/dummy_sched.c

@@ -133,7 +133,6 @@ static void dummy_func(void *descr[] __attribute__ ((unused)), void *arg __attri
 
 static struct starpu_codelet dummy_codelet =
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_funcs = {dummy_func, NULL},
 	.cuda_funcs = {dummy_func, NULL},
         .opencl_funcs = {dummy_func, NULL},

+ 0 - 1
examples/spmd/vector_scal_spmd.c

@@ -81,7 +81,6 @@ static struct starpu_perfmodel vector_scal_model =
 static struct starpu_codelet cl =
 {
 	.modes = { STARPU_RW },
-	.where = STARPU_CPU,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {scal_cpu_func, NULL},

+ 3 - 4
examples/spmv/dw_block_spmv.c

@@ -121,17 +121,17 @@ void call_filters(void)
 	struct starpu_data_filter bcsr_f;
 	struct starpu_data_filter vector_in_f, vector_out_f;
 
-	bcsr_f.filter_func    = starpu_canonical_block_filter_bcsr;
+	bcsr_f.filter_func    = starpu_bcsr_filter_canonical_block;
 	bcsr_f.get_nchildren = get_bcsr_nchildren;
 	/* the children use a matrix interface ! */
 	bcsr_f.get_child_ops = get_bcsr_child_ops;
 
-	vector_in_f.filter_func = starpu_block_filter_func_vector;
+	vector_in_f.filter_func = starpu_vector_filter_block;
 	vector_in_f.nchildren  = size/c;
 	vector_in_f.get_nchildren  = NULL;
 	vector_in_f.get_child_ops  = NULL;
 	
-	vector_out_f.filter_func = starpu_block_filter_func_vector;
+	vector_out_f.filter_func = starpu_vector_filter_block;
 	vector_out_f.nchildren  = size/r;
 	vector_out_f.get_nchildren  = NULL;
 	vector_out_f.get_child_ops  = NULL;
@@ -147,7 +147,6 @@ unsigned totaltasks;
 
 struct starpu_codelet cl =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = { cpu_block_spmv, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cublas_block_spmv, NULL},

+ 1 - 2
examples/spmv/spmv.c

@@ -88,14 +88,13 @@ static struct starpu_data_filter csr_f =
 
 static struct starpu_data_filter vector_f =
 {
-	.filter_func = starpu_block_filter_func_vector,
+	.filter_func = starpu_vector_filter_block,
 	/* This value is defined later on */
 	.nchildren = -1,
 };
 
 static struct starpu_codelet spmv_cl =
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_funcs = {spmv_kernel_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {spmv_kernel_cuda, NULL},

+ 0 - 24
examples/stencil/stencil-kernels.c

@@ -456,14 +456,6 @@ static struct starpu_perfmodel cl_update_model =
 
 struct starpu_codelet cl_update =
 {
-	.where = 0 |
-#ifdef STARPU_USE_CUDA
-		STARPU_CUDA|
-#endif
-#ifdef STARPU_USE_OPENCL
-                STARPU_OPENCL|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {update_func_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {update_func_cuda, NULL},
@@ -664,14 +656,6 @@ static struct starpu_perfmodel save_cl_top_model =
 
 struct starpu_codelet save_cl_bottom =
 {
-	.where = 0 |
-#ifdef STARPU_USE_CUDA
-		STARPU_CUDA|
-#endif
-#ifdef STARPU_USE_OPENCL
-		STARPU_OPENCL|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
@@ -686,14 +670,6 @@ struct starpu_codelet save_cl_bottom =
 
 struct starpu_codelet save_cl_top =
 {
-	.where = 0|
-#ifdef STARPU_USE_CUDA
-		STARPU_CUDA|
-#endif
-#ifdef STARPU_USE_OPENCL
-		STARPU_OPENCL|
-#endif
-		STARPU_CPU,
 	.cpu_funcs = {dummy_func_top_cpu, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_top_cuda, NULL},

+ 0 - 1
examples/stencil/stencil-tasks.c

@@ -217,7 +217,6 @@ static void null_func(void *descr[] __attribute__((unused)), void *arg __attribu
 static struct starpu_codelet null =
 {
 	.modes = { STARPU_W, STARPU_W },
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
 	.cpu_funcs = {null_func, NULL},
 	.cuda_funcs = {null_func, NULL},
 	.opencl_funcs = {null_func, NULL},

+ 0 - 1
examples/top/hello_world_top.c

@@ -100,7 +100,6 @@ struct starpu_codelet cl =
 {
 	/* this codelet may only be executed on a CPU, and its cpu
  	 * implementation is function "cpu_func" */
-	.where = STARPU_CPU,
 	.cpu_funcs = {cpu_func, NULL},
 	/* the codelet does not manipulate any data that is managed
 	 * by our DSM */

+ 2 - 2
gcc-plugin/examples/cholesky/cholesky.c

@@ -111,7 +111,7 @@ int main(int argc, char **argv)
 //	conf.calibrate = 1;
 #pragma starpu initialize
 
-        starpu_helper_cublas_init();
+        starpu_cublas_init();
 
 	float bmat[nblocks][nblocks][BLOCKSIZE * BLOCKSIZE] __heap;
 
@@ -247,7 +247,7 @@ int main(int argc, char **argv)
 		}
         }
 
-        starpu_helper_cublas_shutdown();
+        starpu_cublas_shutdown();
 #pragma starpu shutdown
 
 	assert(correctness);

+ 2 - 32
include/starpu.h

@@ -43,6 +43,7 @@ typedef unsigned long long uint64_t;
 #include <starpu_data.h>
 #include <starpu_data_interfaces.h>
 #include <starpu_data_filters.h>
+#include <starpu_stdlib.h>
 #include <starpu_perfmodel.h>
 #include <starpu_worker.h>
 #include <starpu_task.h>
@@ -62,6 +63,7 @@ typedef unsigned long long uint64_t;
 #include <starpu_profiling.h>
 #include <starpu_top.h>
 #include <starpu_fxt.h>
+#include <starpu_driver.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -72,31 +74,6 @@ extern "C"
 #define main starpu_main
 #endif
 
-struct starpu_driver
-{
-	enum starpu_archtype type;
-	union
-	{
-		unsigned cpu_id;
-		unsigned cuda_id;
-#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
-		cl_device_id opencl_id;
-#elif defined(STARPU_SIMGRID)
-		unsigned opencl_id;
-#endif
-		/*
-		 * HOWTO: add a new kind of device to the starpu_driver structure.
-		 * 1) Add a member to this union.
-		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
-		 *    not always launched.
-		 * 3) Edit starpu_driver_run() so that it can handle another
-		 *    kind of architecture.
-		 * 4) Write _starpu_run_foobar() in the corresponding driver.
-		 * 5) Test the whole thing :)
-		 */
-	} id;
-};
-
 struct starpu_conf
 {
 	/* Will be initialized by starpu_conf_init */
@@ -173,18 +150,11 @@ int starpu_asynchronous_opencl_copy_disabled(void);
 
 void starpu_profiling_init();
 void starpu_display_stats();
-int starpu_driver_run(struct starpu_driver *d);
-void starpu_drivers_request_termination(void);
 
-int starpu_driver_init(struct starpu_driver *d);
-int starpu_driver_run_once(struct starpu_driver *d);
-int starpu_driver_deinit(struct starpu_driver *d);
 #ifdef __cplusplus
 }
 #endif
 
-#if defined(STARPU_USE_DEPRECATED_API)
 #include "starpu_deprecated_api.h"
-#endif /* STARPU_USE_DEPRECATED_API */
 
 #endif /* __STARPU_H__ */

+ 2 - 2
include/starpu_cublas.h

@@ -23,8 +23,8 @@ extern "C"
 {
 #endif
 /* Some helper functions for applications using CUBLAS kernels */
-void starpu_helper_cublas_init(void);
-void starpu_helper_cublas_shutdown(void);
+void starpu_cublas_init(void);
+void starpu_cublas_shutdown(void);
 
 #ifdef __cplusplus
 }

+ 0 - 3
include/starpu_data.h

@@ -85,9 +85,6 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, unsigned node, e
 void starpu_data_release(starpu_data_handle_t handle);
 void starpu_data_release_on_node(starpu_data_handle_t handle, unsigned node);
 
-void starpu_malloc_set_align(size_t align);
-int starpu_malloc(void **A, size_t dim);
-int starpu_free(void *A);
 void starpu_memory_display_stats();
 
 /* XXX These macros are provided to avoid breaking old codes. But consider

+ 16 - 16
include/starpu_data_filters.h

@@ -58,28 +58,28 @@ void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters,
 /* a few examples of filters */
 
 /* for BCSR */
-void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /* (filters for matrix interface) */
-void starpu_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vertical_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_matrix_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_matrix_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_matrix_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_matrix_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /* for vector */
-void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_block_shadow_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_list(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /* for block */
-void starpu_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vertical_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_vertical_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_depth_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-void starpu_depth_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 #ifdef __cplusplus
 }

+ 0 - 5
include/starpu_data_interfaces.h

@@ -135,11 +135,6 @@ int starpu_data_interface_get_next_id(void);
 void starpu_data_register(starpu_data_handle_t *handleptr, unsigned home_node, void *data_interface, struct starpu_data_interface_ops *ops);
 void starpu_data_register_same(starpu_data_handle_t *handledst, starpu_data_handle_t handlesrc);
 
-/* Allocate SIZE bytes on node NODE */
-uintptr_t starpu_allocate_buffer_on_node(unsigned dst_node, size_t size);
-/* Free ADDR on node NODE */
-void starpu_free_buffer_on_node(unsigned dst_node, uintptr_t addr, size_t size);
-
 /* Return the pointer associated with HANDLE on node NODE or NULL if HANDLE's
  * interface does not support this operation or data for this handle is not
  * allocated on that node. */

+ 35 - 1
include/starpu_deprecated_api.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -23,8 +23,11 @@ extern "C"
 {
 #endif
 
+#if defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
 #warning Your application is using former types. You may want to update to use the latest API, by using tools/dev/rename.sh.
+#endif /* defined(STARPU_USE_DEPRECATED_API) || defined(STARPU_USE_DEPRECATED_ONE_ZERO_API) */
 
+#ifdef STARPU_USE_DEPRECATED_API
 typedef starpu_data_handle_t starpu_data_handle;
 typedef struct starpu_block_interface starpu_block_interface_t;
 typedef struct starpu_matrix_interface starpu_matrix_interface_t;
@@ -56,6 +59,37 @@ typedef enum starpu_access_mode starpu_access_mode;
 #define starpu_pack_cl_args   	       starpu_codelet_pack_args
 #define starpu_task_deinit	       starpu_task_clean
 
+#endif /* STARPU_USE_DEPRECATED_API */
+
+#ifdef STARPU_USE_DEPRECATED_ONE_ZERO_API
+
+#define starpu_allocate_buffer_on_node	starpu_malloc_on_node
+#define starpu_free_buffer_on_node	starpu_free_on_node
+#define starpu_helper_cublas_init	starpu_cublas_init
+#define starpu_helper_cublas_shutdown	starpu_cublas_shutdown
+
+#define starpu_canonical_block_filter_bcsr	starpu_bcsr_filter_canonical_block
+#define starpu_vertical_block_filter_func_csr	starpu_csr_filter_vertical_block
+
+#define starpu_block_filter_func			starpu_matrix_filter_block
+#define starpu_block_shadow_filter_func			starpu_matrix_filter_block_shadow
+#define starpu_vertical_block_filter_func		starpu_matrix_filter_vertical_block
+#define starpu_vertical_block_shadow_filter_func	starpu_matrix_filter_vertical_block_shadow
+
+#define starpu_block_filter_func_vector		starpu_vector_filter_block
+#define starpu_block_shadow_filter_func_vector	starpu_vector_filter_block_shadow
+#define starpu_vector_list_filter_func		starpu_vector_filter_list
+#define starpu_vector_divide_in_2_filter_func	starpu_vector_filter_divide_in_2
+
+#define starpu_block_filter_func_block			starpu_block_filter_block
+#define starpu_block_shadow_filter_func_block		starpu_block_filter_block_shadow
+#define starpu_vertical_block_filter_func_block		starpu_block_filter_vertical_block
+#define starpu_vertical_block_shadow_filter_func_block	starpu_block_filter_vertical_block_shadow
+#define starpu_depth_block_filter_func_block		starpu_block_filter_depth_block
+#define starpu_depth_block_shadow_filter_func_block	starpu_block_filter_depth_block_shadow
+
+#endif /* STARPU_USE_DEPRECATED_ONE_ZERO_API */
+
 #ifdef __cplusplus
 }
 #endif

+ 67 - 0
include/starpu_driver.h

@@ -0,0 +1,67 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_DRIVER_H__
+#define __STARPU_DRIVER_H__
+
+#include <starpu_config.h>
+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
+#include <starpu_opencl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct starpu_driver
+{
+	enum starpu_archtype type;
+	union
+	{
+		unsigned cpu_id;
+		unsigned cuda_id;
+#if defined(STARPU_USE_OPENCL) && !defined(__CUDACC__)
+		cl_device_id opencl_id;
+#elif defined(STARPU_SIMGRID)
+		unsigned opencl_id;
+#endif
+		/*
+		 * HOWTO: add a new kind of device to the starpu_driver structure.
+		 * 1) Add a member to this union.
+		 * 2) Edit _starpu_launch_drivers() to make sure the driver is
+		 *    not always launched.
+		 * 3) Edit starpu_driver_run() so that it can handle another
+		 *    kind of architecture.
+		 * 4) Write _starpu_run_foobar() in the corresponding driver.
+		 * 5) Test the whole thing :)
+		 */
+	} id;
+};
+
+int starpu_driver_run(struct starpu_driver *d);
+void starpu_drivers_request_termination(void);
+
+int starpu_driver_init(struct starpu_driver *d);
+int starpu_driver_run_once(struct starpu_driver *d);
+int starpu_driver_deinit(struct starpu_driver *d);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_DRIVER_H__ */

+ 3 - 1
include/starpu_scheduler.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -183,6 +183,8 @@ int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node);
 
 /* Return the current date in us */
 double starpu_timing_now(void);
+/* Returns the perfmodel footprint for the task */
+uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns expected task duration in us */
 double starpu_task_expected_length(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl);
 /* Returns an estimated speedup factor relative to CPU speed */

+ 41 - 0
include/starpu_stdlib.h

@@ -0,0 +1,41 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_STDLIB_H__
+#define __STARPU_STDLIB_H__
+
+#include <starpu.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void starpu_malloc_set_align(size_t align);
+int starpu_malloc(void **A, size_t dim);
+int starpu_free(void *A);
+
+/* Allocate SIZE bytes on node DST_NODE */
+uintptr_t starpu_malloc_on_node(unsigned dst_node, size_t size);
+/* Free ADDR on node DST_NODE; SIZE is presumably the size given at allocation -- TODO confirm */
+void starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_STDLIB_H__ */

+ 1 - 1
include/starpu_util.h

@@ -50,7 +50,7 @@ extern "C"
 #  define STARPU_ATTRIBUTE_INTERNAL
 #endif
 
-#if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API)
+#if STARPU_GNUC_PREREQ(3, 1) && !defined(BUILDING_STARPU) && !defined(STARPU_USE_DEPRECATED_API) && !defined(STARPU_USE_DEPRECATED_ONE_ZERO_API)
 #define STARPU_DEPRECATED  __attribute__((__deprecated__))
 #else
 #define STARPU_DEPRECATED

+ 17 - 6
mpi/examples/complex/mpi_complex.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -37,6 +37,10 @@ int main(int argc, char **argv)
 	int ret;
 	int compare;
 
+	starpu_data_handle_t handle;
+	starpu_data_handle_t handle2;
+	starpu_data_handle_t foo_handle;
+
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 	ret = starpu_mpi_init(&argc, &argv, 1);
@@ -55,11 +59,9 @@ int main(int argc, char **argv)
 		{
 			double real[2] = {4.0, 2.0};
 			double imaginary[2] = {7.0, 9.0};
-			starpu_data_handle_t handle;
 
 			double real2[2] = {14.0, 12.0};
 			double imaginary2[2] = {17.0, 19.0};
-			starpu_data_handle_t handle2;
 
 			int *compare_ptr = &compare;
 
@@ -76,7 +78,6 @@ int main(int argc, char **argv)
 			{
 				// We send a dummy variable only to check communication with predefined datatypes
 				int foo=12;
-				starpu_data_handle_t foo_handle;
 				starpu_variable_data_register(&foo_handle, 0, (uintptr_t)&foo, sizeof(foo));
 				starpu_mpi_isend_detached(foo_handle, 1, 40, MPI_COMM_WORLD, NULL, NULL);
 				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
@@ -86,7 +87,6 @@ int main(int argc, char **argv)
 		{
 			double real[2] = {0.0, 0.0};
 			double imaginary[2] = {0.0, 0.0};
-			starpu_data_handle_t handle;
 
 			starpu_complex_data_register(&handle, 0, real, imaginary, 2);
 			starpu_mpi_irecv_detached(handle, 0, 10, MPI_COMM_WORLD, NULL, NULL);
@@ -96,7 +96,6 @@ int main(int argc, char **argv)
 			{
 				// We send a dummy variable only to check communication with predefined datatypes
 				int foo=12;
-				starpu_data_handle_t foo_handle;
 				starpu_variable_data_register(&foo_handle, -1, (uintptr_t)NULL, sizeof(foo));
 				starpu_mpi_irecv_detached(foo_handle, 0, 40, MPI_COMM_WORLD, NULL, NULL);
 				starpu_insert_task(&foo_display, STARPU_R, foo_handle, 0);
@@ -104,7 +103,19 @@ int main(int argc, char **argv)
 
 		}
 	}
+
 	starpu_task_wait_for_all();
+
+	if (rank == 0)
+	{
+		starpu_data_unregister(handle2);
+	}
+	if (rank == 0 || rank == 1)
+	{
+		starpu_data_unregister(handle);
+		starpu_data_unregister(foo_handle);
+	}
+
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 4 - 4
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -41,23 +41,23 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	parse_args(argc, argv, nodes);
 
 	matrix_init(&bmat, rank, nodes, 1);
 	matrix_display(bmat, rank);
 
-	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
 
 	starpu_mpi_shutdown();
 
 	matrix_display(bmat, rank);
 
-	dw_cholesky_check_computation(bmat, size, rank, nodes, &correctness, &flops);
+	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
 
 	matrix_free(&bmat, rank, nodes, 1);
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 
 	assert(correctness);

+ 2 - 5
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -29,7 +29,6 @@
 
 static struct starpu_codelet cl11 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u11, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u11, NULL},
@@ -41,7 +40,6 @@ static struct starpu_codelet cl11 =
 
 static struct starpu_codelet cl21 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u21, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u21, NULL},
@@ -53,7 +51,6 @@ static struct starpu_codelet cl21 =
 
 static struct starpu_codelet cl22 =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 	.cpu_funcs = {chol_cpu_codelet_update_u22, NULL},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {chol_cublas_codelet_update_u22, NULL},
@@ -67,7 +64,7 @@ static struct starpu_codelet cl22 =
  *	code to bootstrap the factorization
  *	and construct the DAG
  */
-void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops)
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
 {
 	struct timeval start;
 	struct timeval end;
@@ -169,7 +166,7 @@ void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, in
 	}
 }
 
-void dw_cholesky_check_computation(float ***matA, unsigned size, int rank, int nodes, int *correctness, double *flops)
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops)
 {
 	unsigned i,j,x,y;
 	float *rmat = malloc(size*size*sizeof(float));

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h

@@ -23,8 +23,8 @@
  *	code to bootstrap the factorization
  *	and construct the DAG
  */
-void dw_cholesky(float ***matA, unsigned size, unsigned ld, unsigned nblocks, int rank, int nodes, double *timing, double *flops);
+void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops);
 
-void dw_cholesky_check_computation(float ***matA, unsigned size, int rank, int nodes, int *correctness, double *flops);
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops);
 
 #endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 3 - 3
mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -40,18 +40,18 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 	MPI_Comm_size(MPI_COMM_WORLD, &nodes);
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	parse_args(argc, argv, nodes);
 
 	matrix_init(&bmat, rank, nodes, 0);
 
-	dw_cholesky(bmat, size, size/nblocks, nblocks, rank, nodes, &timing, &flops);
+	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
 
 	starpu_mpi_shutdown();
 
 	matrix_free(&bmat, rank, nodes, 0);
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_shutdown();
 
 	if (rank == 0)

+ 16 - 16
mpi/examples/mpi_lu/plu_example.c

@@ -117,16 +117,16 @@ unsigned STARPU_PLU(display_flag)(void)
 	return display;
 }
 
-static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nblocks)
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
 {
-	const unsigned block_size = (size/nblocks);
+	const unsigned block_size = (psize/pnblocks);
 
 	unsigned i, j;
 	for (i = 0; i < block_size; i++)
-	for (j = 0; j < block_size; j++)
-	{
-		blockptr[j+i*block_size] = (TYPE)starpu_drand48();
-	}
+	     for (j = 0; j < block_size; j++)
+	     {
+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	     }
 }
 
 #ifdef SINGLE_TMP11
@@ -163,15 +163,15 @@ starpu_data_handle_t STARPU_PLU(get_tmp_21_block_handle)(unsigned i, unsigned k)
 }
 #endif
 
-static unsigned tmp_11_block_is_needed(int rank, unsigned nblocks, unsigned k)
+static unsigned tmp_11_block_is_needed(int rank, unsigned pnblocks, unsigned k)
 {
 	return 1;
 }
 
-static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
+static unsigned tmp_12_block_is_needed(int rank, unsigned pnblocks, unsigned j)
 {
 	unsigned i;
-	for (i = 1; i < nblocks; i++)
+	for (i = 1; i < pnblocks; i++)
 	{
 		if (get_block_rank(i, j) == rank)
 			return 1;
@@ -180,10 +180,10 @@ static unsigned tmp_12_block_is_needed(int rank, unsigned nblocks, unsigned j)
 	return 0;
 }
 
-static unsigned tmp_21_block_is_needed(int rank, unsigned nblocks, unsigned i)
+static unsigned tmp_21_block_is_needed(int rank, unsigned pnblocks, unsigned i)
 {
 	unsigned j;
-	for (j = 1; j < nblocks; j++)
+	for (j = 1; j < pnblocks; j++)
 	{
 		if (get_block_rank(i, j) == rank)
 			return 1;
@@ -373,7 +373,7 @@ starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
 	return dataA_handles[j+i*nblocks];
 }
 
-static void display_grid(int rank, unsigned nblocks)
+static void display_grid(int rank, unsigned pnblocks)
 {
 	if (!display)
 		return;
@@ -383,9 +383,9 @@ static void display_grid(int rank, unsigned nblocks)
 		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
 
 		unsigned i, j;
-		for (j = 0; j < nblocks; j++)
+		for (j = 0; j < pnblocks; j++)
 		{
-			for (i = 0; i < nblocks; i++)
+			for (i = 0; i < pnblocks; i++)
 			{
 				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
 				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
@@ -432,7 +432,7 @@ int main(int argc, char **argv)
 
 	STARPU_ASSERT(p*q == world_size);
 
-	starpu_helper_cublas_init();
+	starpu_cublas_init();
 
 	int barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
@@ -563,7 +563,7 @@ int main(int argc, char **argv)
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 
-	starpu_helper_cublas_shutdown();
+	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 0 - 1
mpi/examples/stencil/stencil5.c

@@ -31,7 +31,6 @@ void stencil5_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet stencil5_cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {stencil5_cpu, NULL},
 	.nbuffers = 5,
 	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}

+ 2 - 1
mpi/src/starpu_mpi.c

@@ -175,6 +175,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 		starpu_data_handle_t size_handle;
 		starpu_variable_data_register(&size_handle, 0, (uintptr_t)&(size), sizeof(size));
 		starpu_mpi_send(size_handle, dest, mpi_tag, comm);
+		starpu_data_unregister(size_handle);
 	}
 
 	return _starpu_mpi_isend_irecv_common(data_handle, size, dest, mpi_tag, comm, detached, callback, arg, SEND_REQ, _starpu_mpi_isend_pack_func, STARPU_R);
@@ -590,7 +591,7 @@ int starpu_mpi_barrier(MPI_Comm comm)
 
 	ret = barrier_req->ret;
 
-	//free(waiting_req);
+	free(barrier_req);
 	_STARPU_MPI_LOG_OUT();
 	return ret;
 }

+ 51 - 2
mpi/src/starpu_mpi_insert_task.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2011-2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -459,6 +459,19 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		{
 			va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			(void)va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			(void)va_arg(varg_list, double);
+		}
+		else if (arg_type==STARPU_TAG)
+		{
+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
+		}
+
 	}
 	va_end(varg_list);
 
@@ -559,6 +572,18 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		{
 			va_arg(varg_list, starpu_data_handle_t);
 		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			(void)va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			(void)va_arg(varg_list, double);
+		}
+		else if (arg_type==STARPU_TAG)
+		{
+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
+		}
 	}
 	va_end(varg_list);
 
@@ -628,7 +653,19 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 			{
 				va_arg(varg_list, starpu_data_handle_t);
 			}
-		}
+			else if (arg_type==STARPU_HYPERVISOR_TAG)
+			{
+				(void)va_arg(varg_list, int);
+			}
+			else if (arg_type==STARPU_FLOPS)
+			{
+				(void)va_arg(varg_list, double);
+			}
+			else if (arg_type==STARPU_TAG)
+			{
+				STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
+			}
+			}
 		va_end(varg_list);
 	}
 
@@ -686,6 +723,18 @@ int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 		{
 			va_arg(varg_list, starpu_data_handle_t);
 		}
+		else if (arg_type==STARPU_HYPERVISOR_TAG)
+		{
+			(void)va_arg(varg_list, int);
+		}
+		else if (arg_type==STARPU_FLOPS)
+		{
+			(void)va_arg(varg_list, double);
+		}
+		else if (arg_type==STARPU_TAG)
+		{
+			STARPU_ASSERT_MSG(0, "STARPU_TAG is not supported in MPI mode\n");
+		}
 	}
 
 	va_end(varg_list);

+ 0 - 1
mpi/tests/insert_task.c

@@ -29,7 +29,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet mycodelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}

+ 0 - 1
mpi/tests/insert_task_block.c

@@ -46,7 +46,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet mycodelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 1,
 	.modes = {STARPU_RW}

+ 2 - 5
mpi/tests/insert_task_cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,7 +33,6 @@ void func_cpu(__attribute__ ((unused)) void *descr[], __attribute__ ((unused)) v
 
 struct starpu_codelet mycodelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
@@ -53,9 +52,8 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 	int ret;
 	unsigned v[2][N];
 	starpu_data_handle_t data_handles[2];
-	char *string;
+	char string[50];
 
-	string = malloc(50);
 	sprintf(string, "STARPU_MPI_CACHE=%d", enabled);
 	putenv(string);
 
@@ -104,7 +102,6 @@ void test_cache(int rank, int size, int enabled, size_t *comm_amount)
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
-	free(string);
 }
 
 int main(int argc, char **argv)

+ 0 - 5
mpi/tests/insert_task_owner.c

@@ -32,7 +32,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet mycodelet_r_w =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_W}
@@ -40,7 +39,6 @@ struct starpu_codelet mycodelet_r_w =
 
 struct starpu_codelet mycodelet_rw_r =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_R}
@@ -48,7 +46,6 @@ struct starpu_codelet mycodelet_rw_r =
 
 struct starpu_codelet mycodelet_rw_rw =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}
@@ -56,7 +53,6 @@ struct starpu_codelet mycodelet_rw_rw =
 
 struct starpu_codelet mycodelet_w_r =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_W, STARPU_R}
@@ -64,7 +60,6 @@ struct starpu_codelet mycodelet_w_r =
 
 struct starpu_codelet mycodelet_r_r =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R}

+ 0 - 1
mpi/tests/insert_task_owner2.c

@@ -39,7 +39,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet mycodelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 4,
 	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W}

+ 0 - 1
mpi/tests/insert_task_owner_data.c

@@ -29,7 +29,6 @@ void func_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 struct starpu_codelet mycodelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {func_cpu, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_RW, STARPU_RW}

+ 1 - 0
mpi/tests/mpi_detached_tag.c

@@ -74,6 +74,7 @@ int main(int argc, char **argv)
 		starpu_tag_wait(tag);
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 1 - 0
mpi/tests/mpi_irecv.c

@@ -73,6 +73,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 1 - 0
mpi/tests/mpi_irecv_detached.c

@@ -91,6 +91,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 1 - 0
mpi/tests/mpi_isend.c

@@ -74,6 +74,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 1 - 0
mpi/tests/mpi_isend_detached.c

@@ -96,6 +96,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 1 - 0
mpi/tests/mpi_probe.c

@@ -91,6 +91,7 @@ int main(int argc, char **argv)
 		}
 	}
 
+	starpu_data_unregister(tab_handle);
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 

+ 0 - 4
mpi/tests/mpi_reduction.c

@@ -24,7 +24,6 @@ extern void display_cpu_func(void *descr[], void *cl_arg);
 
 static struct starpu_codelet init_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {init_cpu_func, NULL},
 	.nbuffers = 1,
 	.name = "init_codelet"
@@ -32,7 +31,6 @@ static struct starpu_codelet init_codelet =
 
 static struct starpu_codelet redux_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {redux_cpu_func, NULL},
 	.nbuffers = 2,
 	.name = "redux_codelet"
@@ -40,7 +38,6 @@ static struct starpu_codelet redux_codelet =
 
 static struct starpu_codelet dot_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {dot_cpu_func, NULL},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_REDUX},
@@ -49,7 +46,6 @@ static struct starpu_codelet dot_codelet =
 
 static struct starpu_codelet display_codelet =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {display_cpu_func, NULL},
 	.nbuffers = 1,
 	.modes = {STARPU_R},

+ 0 - 1
mpi/tests/mpi_scatter_gather.c

@@ -48,7 +48,6 @@ void cpu_codelet(void *descr[], void *_args)
 
 static struct starpu_codelet cl =
 {
-	.where = STARPU_CPU,
 	.cpu_funcs = {cpu_codelet, NULL},
 	.nbuffers = 1,
 	.modes = {STARPU_RW},

+ 0 - 1
mpi/tests/ring.c

@@ -39,7 +39,6 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 static struct starpu_codelet increment_cl =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
 #endif

+ 0 - 1
mpi/tests/ring_async.c

@@ -39,7 +39,6 @@ void increment_cpu(void *descr[], __attribute__ ((unused)) void *_args)
 
 static struct starpu_codelet increment_cl =
 {
-	.where = STARPU_CPU|STARPU_CUDA,
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {increment_cuda, NULL},
 #endif

+ 0 - 0
mpi/tests/ring_async_implicit.c


Some files were not shown because too many files changed in this diff