浏览代码

merge trunk@7007:7057

Nathalie Furmento 13 年之前
父节点
当前提交
4011da62a7
共有 64 个文件被更改,包括 1660 次插入539 次删除
  1. 1 0
      ChangeLog
  2. 2 0
      configure.ac
  3. 119 37
      doc/chapters/basic-api.texi
  4. 1 1
      doc/chapters/perf-feedback.texi
  5. 2 0
      examples/Makefile.am
  6. 10 3
      examples/audio/starpu_audio_processing.c
  7. 2 1
      examples/axpy/axpy.c
  8. 2 2
      examples/cholesky/cholesky_kernels.c
  9. 26 8
      examples/filters/shadow.c
  10. 292 0
      examples/filters/shadow2d.c
  11. 332 0
      examples/filters/shadow3d.c
  12. 5 1
      examples/gl_interop/gl_interop.c
  13. 5 1
      examples/gl_interop/gl_interop_idle.c
  14. 2 1
      examples/heat/dw_factolu.h
  15. 7 7
      examples/heat/dw_factolu_kernels.c
  16. 1 0
      examples/heat/heat.c
  17. 13 13
      examples/lu/xlu_kernels.c
  18. 8 6
      examples/pi/pi_redux.c
  19. 7 11
      examples/reductions/dot_product.c
  20. 7 7
      gcc-plugin/examples/cholesky/cholesky_kernels.c
  21. 9 1
      include/starpu_data_filters.h
  22. 3 1
      include/starpu_deprecated_api.h
  23. 17 21
      include/starpu_perfmodel.h
  24. 8 7
      mpi/examples/cholesky/mpi_cholesky_kernels.c
  25. 7 7
      mpi/examples/mpi_lu/pxlu_kernels.c
  26. 4 1
      mpi/examples/mpi_lu/pxlu_kernels.h
  27. 1 0
      src/Makefile.am
  28. 6 0
      src/core/perfmodel/perfmodel.h
  29. 12 1
      src/core/perfmodel/perfmodel_bus.c
  30. 45 45
      src/core/perfmodel/perfmodel_history.c
  31. 254 0
      src/core/perfmodel/perfmodel_print.c
  32. 5 5
      src/core/perfmodel/regression.c
  33. 1 1
      src/core/perfmodel/regression.h
  34. 18 4
      src/core/sched_policy.c
  35. 4 6
      src/core/topology.c
  36. 182 2
      src/datawizard/interfaces/block_filters.c
  37. 8 2
      src/datawizard/interfaces/data_interface.c
  38. 74 1
      src/datawizard/interfaces/matrix_filters.c
  39. 1 1
      src/datawizard/interfaces/matrix_interface.c
  40. 3 0
      src/drivers/cpu/driver_cpu.c
  41. 11 0
      src/drivers/cuda/driver_cuda.c
  42. 1 1
      src/sched_policies/eager_central_policy.c
  43. 2 4
      src/util/starpu_cublas.c
  44. 1 2
      tests/datawizard/acquire_cb.c
  45. 5 1
      tests/datawizard/handle_to_pointer.c
  46. 12 12
      tests/datawizard/increment_redux.c
  47. 11 11
      tests/datawizard/increment_redux_lazy.c
  48. 12 12
      tests/datawizard/increment_redux_v2.c
  49. 3 2
      tests/datawizard/lazy_allocation.c
  50. 6 3
      tests/datawizard/manual_reduction.c
  51. 7 2
      tests/datawizard/write_only_tmp_buffer.c
  52. 4 4
      tests/datawizard/wt_broadcast.c
  53. 4 4
      tests/datawizard/wt_host.c
  54. 1 1
      tests/experiments/latency/cuda_latency.c
  55. 7 6
      tests/microbenchs/matrix_as_vector.c
  56. 6 3
      tests/perfmodels/non_linear_regression_based.c
  57. 5 2
      tests/perfmodels/regression_based.c
  58. 3 3
      tests/perfmodels/valid_model.c
  59. 8 0
      tools/dev/rename.sed
  60. 2 2
      tools/dev/rename.sh
  61. 5 1
      tools/starpu_calibrate_bus.c
  62. 6 1
      tools/starpu_machine_display.c
  63. 23 248
      tools/starpu_perfmodel_display.c
  64. 9 9
      tools/starpu_perfmodel_plot.c

+ 1 - 0
ChangeLog

@@ -38,6 +38,7 @@ Changes:
   * Fix CPU binding for optimized CPU-GPU transfers.
   * The cache of starpu_mpi_insert_task is fixed and thus now enabled by
     default.
+  * Standardize objects name in the performance model API
 
 Small changes:
   * STARPU_NCPU should now be used instead of STARPU_NCPUS. STARPU_NCPUS is

+ 2 - 0
configure.ac

@@ -509,6 +509,8 @@ if test x$enable_cuda = xyes; then
 		NVCCFLAGS="${NVCCFLAGS} -m64"
 		AC_SUBST(NVCCFLAGS)
 	fi
+
+	AC_CHECK_HEADERS([cuda_gl_interop.h])
 fi
 
 have_magma=no

+ 119 - 37
doc/chapters/basic-api.texi

@@ -1064,7 +1064,7 @@ subdata according to the filter @var{f}, as shown in the following example:
 @cartouche
 @smallexample
 struct starpu_data_filter f = @{
-    .filter_func = starpu_vertical_block_filter_func,
+    .filter_func = starpu_block_filter_func,
     .nchildren = nslicesx,
     .get_nchildren = NULL,
     .get_child_ops = NULL
@@ -1130,38 +1130,16 @@ starpu_data_filter.
 @subsection Predefined filter functions
 
 @menu
-* Partitioning BCSR Data::
-* Partitioning BLAS interface::
 * Partitioning Vector Data::
-* Partitioning Block Data::
+* Partitioning Matrix Data::
+* Partitioning 3D Matrix Data::
+* Partitioning BCSR Data::
 @end menu
 
 This section gives a partial list of the predefined partitioning functions.
 Examples on how to use them are shown in @ref{Partitioning Data}. The complete
 list can be found in @code{starpu_data_filters.h} .
 
-@node Partitioning BCSR Data
-@subsubsection Partitioning BCSR Data
-
-@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a block-sparse matrix into dense matrices.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a block-sparse matrix into vertical block-sparse matrices.
-@end deftypefun
-
-@node Partitioning BLAS interface
-@subsubsection Partitioning BLAS interface
-
-@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix into horizontal blocks.
-@end deftypefun
-
-@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a dense Matrix into vertical blocks.
-@end deftypefun
-
 @node Partitioning Vector Data
 @subsubsection Partitioning Vector Data
 
@@ -1174,7 +1152,7 @@ vector represented by @var{father_interface} once partitioned in
 @deftypefun void starpu_block_shadow_filter_func_vector (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
 Return in @code{*@var{child_interface}} the @var{id}th element of the
 vector represented by @var{father_interface} once partitioned in
-@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}
+@var{nparts} chunks of equal size with a shadow border @code{filter_arg_ptr}, thus getting a vector of size (n-2*shadow)/nparts+2*shadow 
 
 The @code{filter_arg_ptr} field must be the shadow size casted into @code{void*}.
 
@@ -1203,11 +1181,107 @@ chunks of equal size, ignoring @var{nparts}.  Thus, @var{id} must be
 @end deftypefun
 
 
-@node Partitioning Block Data
-@subsubsection Partitioning Block Data
+@node Partitioning Matrix Data
+@subsubsection Partitioning Matrix Data
+
+@deftypefun void starpu_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense Matrix along the x dimension, thus getting (x/nparts,y)
+matrices. If nparts does not divide x, the last submatrix contains the
+remainder.
+@end deftypefun
+
+@deftypefun void starpu_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense Matrix along the x dimension, with a shadow border
+@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y)
+matrices. If nparts does not divide x-2*shadow, the last submatrix contains the
+remainder.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+
+A usage example is available in examples/filters/shadow2d.c
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense Matrix along the y dimension, thus getting (x,y/nparts)
+matrices. If nparts does not divide y, the last submatrix contains the
+remainder.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_shadow_filter_func (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a dense Matrix along the y dimension, with a shadow border
+@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow)
+matrices. If nparts does not divide y-2*shadow, the last submatrix contains the
+remainder.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+
+A usage example is available in examples/filters/shadow2d.c
+@end deftypefun
+
+@node Partitioning 3D Matrix Data
+@subsubsection Partitioning 3D Matrix Data
+
+A usage example is available in examples/filters/shadow3d.c
 
 @deftypefun void starpu_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
-This partitions a 3D matrix along the X axis.
+This partitions a 3D matrix along the X dimension, thus getting (x/nparts,y,z)
+3D matrices. If nparts does not divide x, the last submatrix contains the
+remainder.
+@end deftypefun
+
+@deftypefun void starpu_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a 3D matrix along the X dimension, with a shadow border
+@code{filter_arg_ptr}, thus getting ((x-2*shadow)/nparts+2*shadow,y,z) 3D
+matrices. If nparts does not divide x, the last submatrix contains the
+remainder.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a 3D matrix along the Y dimension, thus getting (x,y/nparts,z)
+3D matrices. If nparts does not divide y, the last submatrix contains the
+remainder.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a 3D matrix along the Y dimension, with a shadow border
+@code{filter_arg_ptr}, thus getting (x,(y-2*shadow)/nparts+2*shadow,z) 3D
+matrices. If nparts does not divide y, the last submatrix contains the
+remainder.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+@end deftypefun
+
+@deftypefun void starpu_depth_block_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a 3D matrix along the Z dimension, thus getting (x,y,z/nparts)
+3D matrices. If nparts does not divide z, the last submatrix contains the
+remainder.
+@end deftypefun
+
+@deftypefun void starpu_depth_block_shadow_filter_func_block (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a 3D matrix along the Z dimension, with a shadow border
+@code{filter_arg_ptr}, thus getting (x,y,(z-2*shadow)/nparts+2*shadow)
+3D matrices. If nparts does not divide z, the last submatrix contains the
+remainder.
+
+IMPORTANT: This can only be used for read-only access, as no coherency is
+enforced for the shadowed parts.
+@end deftypefun
+
+@node Partitioning BCSR Data
+@subsubsection Partitioning BCSR Data
+
+@deftypefun void starpu_canonical_block_filter_bcsr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a block-sparse matrix into dense matrices.
+@end deftypefun
+
+@deftypefun void starpu_vertical_block_filter_func_csr (void *@var{father_interface}, void *@var{child_interface}, {struct starpu_data_filter} *@var{f}, unsigned @var{id}, unsigned @var{nparts})
+This partitions a block-sparse matrix into vertical block-sparse matrices.
 @end deftypefun
 
 @node Codelets and Tasks
@@ -1891,7 +1965,7 @@ Used by @code{STARPU_HISTORY_BASED} and
 implementation number, and returns the size to be used as index for
 history and regression.
 
-@item @code{struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
+@item @code{struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS]}
 Used by @code{STARPU_PER_ARCH}: array of @code{struct
 starpu_per_arch_perfmodel} structures.
 
@@ -1908,7 +1982,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
 @end table
 @end deftp
 
-@deftp {Data Type} {struct starpu_regression_model}
+@deftp {Data Type} {struct starpu_perfmodel_regression_model}
 @table @asis
 @item @code{double sumlny} sum of ln(measured)
 @item @code{double sumlnx} sum of ln(size)
@@ -1925,7 +1999,7 @@ Lock to protect concurrency between loading from disk (W), updating the values
 @end table
 @end deftp
 
-@deftp {Data Type} {struct starpu_per_arch_perfmodel}
+@deftp {Data Type} {struct starpu_perfmodel_per_arch}
 contains information about the performance model of a given arch.
 
 @table @asis
@@ -1946,11 +2020,11 @@ case it depends on the architecture-specific implementation.
 @item @code{struct starpu_htbl32_node *history}
 The history of performance measurements.
 
-@item @code{struct starpu_history_list *list}
+@item @code{struct starpu_perfmodel_history_list *list}
 Used by @code{STARPU_HISTORY_BASED} and @code{STARPU_NL_REGRESSION_BASED},
 records all execution history measures.
 
-@item @code{struct starpu_regression_model regression}
+@item @code{struct starpu_perfmodel_regression_model regression}
 Used by @code{STARPU_HISTORY_REGRESION_BASED} and
 @code{STARPU_NL_REGRESSION_BASED}, contains the estimated factors of the
 regression.
@@ -1958,7 +2032,7 @@ regression.
 @end table
 @end deftp
 
-@deftypefun int starpu_load_history_debug ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
+@deftypefun int starpu_perfmodel_load_symbol ({const char} *@var{symbol}, {struct starpu_perfmodel} *@var{model})
 loads a given performance model. The @var{model} structure has to be completely zero, and will be filled with the information saved in @code{~/.starpu}.
 @end deftypefun
 
@@ -1974,10 +2048,18 @@ returns the architecture name for @var{arch}.
 returns the architecture type of a given worker.
 @end deftypefun
 
-@deftypefun int starpu_list_models ({FILE *}@var{output})
+@deftypefun int starpu_perfmodel_list ({FILE *}@var{output})
 prints a list of all performance models on @var{output}.
 @end deftypefun
 
+@deftypefun void starpu_perfmodel_print ({struct starpu_perfmodel *}@var{model}, {enum starpu_perf_archtype} @var{arch}, unsigned @var{nimpl}, {char *}@var{parameter}, {uint32_t *}footprint, {FILE *}@var{output})
+todo
+@end deftypefun
+
+@deftypefun int starpu_perfmodel_print_all ({struct starpu_perfmodel *}@var{model}, {char *}@var{arch}, @var{char *}parameter,  {uint32_t *}@var{footprint}, {FILE *}@var{output})
+todo
+@end deftypefun
+
 @deftypefun void starpu_bus_print_bandwidth ({FILE *}@var{f})
 prints a matrix of bus bandwidths on @var{f}.
 @end deftypefun

+ 1 - 1
doc/chapters/perf-feedback.texi

@@ -383,7 +383,7 @@ performance models. It also writes a @code{.gp} file in the current directory,
 to be run in the @code{gnuplot} tool, which shows the corresponding curve.
 
 The same can also be achieved by using StarPU's library API, see
-@ref{Performance Model API} and notably the @code{starpu_load_history_debug}
+@ref{Performance Model API} and notably the @code{starpu_perfmodel_load_symbol}
 function. The source code of the @code{starpu_perfmodel_display} tool can be a
 useful example.
 

+ 2 - 0
examples/Makefile.am

@@ -182,6 +182,8 @@ examplebin_PROGRAMS +=				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/shadow				\
+	filters/shadow2d			\
+	filters/shadow3d			\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\

+ 10 - 3
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -179,9 +179,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	{
 		cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].plan, starpu_cuda_get_local_stream());
 
 		cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].inv_plan, starpu_cuda_get_local_stream());
 
 		cudaMalloc((void **)&plans[workerid].localout,
 					nsamples*sizeof(cufftComplex));
@@ -198,11 +200,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	
 	/* filter low freqs */
 	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
 
 	/* filter high freqs */
 	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex), starpu_cuda_get_local_stream());
 
 	/* inverse FFT */
 	cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
@@ -210,6 +212,7 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 
 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -410,6 +413,8 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	starpu_helper_cublas_init();
+
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));
 
 	struct starpu_data_filter f =
@@ -458,6 +463,8 @@ int main(int argc, char **argv)
 	starpu_data_unpartition(A_handle, 0);
 	starpu_data_unregister(A_handle);
 
+	starpu_helper_cublas_shutdown();
+
 	/* we are done ! */
 	starpu_shutdown();
 

+ 2 - 1
examples/axpy/axpy.c

@@ -27,6 +27,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cublas.h>
+#include <starpu_cuda.h>
 #endif
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -74,7 +75,7 @@ void axpy_gpu(void *descr[], __attribute__((unused)) void *arg)
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
 
 	CUBLASAXPY((int)n, alpha, block_x, 1, block_y, 1);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 2 - 2
examples/cholesky/cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -196,7 +196,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaThreadSynchronize();
+			cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			STARPU_ASSERT(!cures);
 			}
 #else

+ 26 - 8
examples/filters/shadow.c

@@ -16,9 +16,28 @@
  */
 
 /*
- * This examplifies the use of the shadow filter: a source vector of NX
- * elements (plus 2*SHADOW wrap-around elements) is copied into a destination
- * vector of NX+NPARTS*2*SHADOW elements, thus showing how shadowing shows up.
+ * This examplifies the use of the shadow filter: a source "vector" of NX
+ * elements (plus 2*SHADOW wrap-around elements) is partitioned into vectors
+ * with some shadowing, and these are copied into a destination "vector2" of
+ * NRPARTS*(NX/NPARTS+2*SHADOW) elements, partitioned in the traditionnal way,
+ * thus showing how shadowing shows up.
+ *
+ * For instance, with NX=8, SHADOW=1, and NPARTS=4:
+ *
+ * vector
+ * x0 x1 x2 x3 x4 x5 x6 x7 x8 x9
+ *
+ * is partitioned into 4 pieces:
+ *
+ * x0 x1 x2 x3
+ *       x2 x3 x4 x5
+ *             x4 x5 x6 x7
+ *                   x6 x7 x8 x9
+ *
+ * which are copied into the 4 destination subparts of vector2, thus getting in
+ * the end:
+ *
+ * x0 x1 x2 x3 x2 x3 x4 x5 x4 x5 x6 x7 x6 x7 x8 x9
  */
 
 #include <starpu.h>
@@ -66,7 +85,7 @@ void cuda_func(void *buffers[], void *cl_arg)
 
 	/* If things go right, sizes should match */
 	STARPU_ASSERT(n == n2);
-	cudaMemcpy(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice);
+	cudaMemcpyAsync(val2, val, n*sizeof(*val), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
@@ -77,7 +96,6 @@ int main(int argc, char **argv)
         int vector[NX + 2*SHADOW];
         int vector2[NX + PARTS*2*SHADOW];
 	starpu_data_handle_t handle, handle2;
-        int factor=1;
 	int ret;
 
         struct starpu_codelet cl =
@@ -114,6 +132,9 @@ int main(int argc, char **argv)
 	starpu_vector_data_register(&handle2, 0, (uintptr_t)vector2, NX + PARTS*2*SHADOW, sizeof(vector[0]));
 
         /* Partition the source vector in PARTS sub-vectors with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
 	struct starpu_data_filter f =
 	{
 		.filter_func = starpu_block_shadow_filter_func_vector,
@@ -137,13 +158,10 @@ int main(int argc, char **argv)
                 starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 1, i);
                 struct starpu_task *task = starpu_task_create();
 
-                factor *= 10;
 		task->handles[0] = sub_handle;
 		task->handles[1] = sub_handle2;
                 task->cl = &cl;
                 task->synchronous = 1;
-                task->cl_arg = &factor;
-                task->cl_arg_size = sizeof(factor);
 
 		ret = starpu_task_submit(task);
 		if (ret == -ENODEV) goto enodev;

+ 292 - 0
examples/filters/shadow2d.c

@@ -0,0 +1,292 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This examplifies the use of the matrix shadow filters: a source "matrix" of
+ * NX*NY elements (plus 2*NX*SHADOWX+2*NY*SHADOWY+4*SHADOWX*SHADOWY wrap-around
+ * elements) is partitioned into matrices with some shadowing, and these are
+ * copied into a destination "matrix2" of
+ * NRPARTSX*NPARTSY*((NX/NPARTSX+2*SHADOWX)*(NY/NPARTSY+2*SHADOWY)) elements,
+ * partitioned in the traditionnal way, thus showing how shadowing shows up.
+ *
+ * For instance, with NX=NY=8, SHADOWX=SHADOWY=1, and NPARTSX=NPARTSY=4:
+ *
+ * matrix
+ * 0123456789
+ * 1234567890
+ * 2345678901
+ * 3456789012
+ * 4567890123
+ * 5678901234
+ * 6789012345
+ * 7890123456
+ * 8901234567
+ * 9012345678
+ *
+ * is partitioned into 4*4 pieces:
+ *
+ * 0123 2345 4567 6789
+ * 1234 3456 5678 7890
+ * 2345 4567 6789 8901
+ * 3456 5678 7890 9012
+ *
+ * 2345 4567 6789 8901
+ * 3456 5678 7890 9012
+ * 4567 6789 8901 0123
+ * 5678 7890 9012 1234
+ *
+ * 4567 6789 8901 0123
+ * 5678 7890 9012 1234
+ * 6789 8901 0123 2345
+ * 7890 9012 1234 3456
+ *
+ * 6789 8901 0123 2345
+ * 7890 9012 1234 3456
+ * 8901 0123 2345 4567
+ * 9012 1234 3456 5678
+ *
+ * which are copied into the 4*4 destination subparts of matrix2, thus getting in
+ * the end:
+ *
+ * 0123234545676789
+ * 1234345656787890
+ * 2345456767898901
+ * 3456567878909012
+ * 2345456767898901
+ * 3456567878909012
+ * 4567678989010123
+ * 5678789090121234
+ * 4567678989010123
+ * 5678789090121234
+ * 6789890101232345
+ * 7890901212343456
+ * 6789890101232345
+ * 7890901212343456
+ * 8901012323454567
+ * 9012123434565678
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+/* Shadow width */
+#define SHADOWX 3
+#define SHADOWY 2
+#define NX    20
+#define NY    30
+#define PARTSX 2
+#define PARTSY 3
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
+        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
+        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+	unsigned i, j;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	STARPU_ASSERT(m == m2);
+	for (j = 0; j < m; j++)
+		for (i = 0; i < n; i++)
+			val2[j*ld2+i] = val[j*ld+i];
+}
+
+#ifdef STARPU_USE_CUDA
+void cuda_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+        unsigned n = STARPU_MATRIX_GET_NX(buffers[0]);
+        unsigned m = STARPU_MATRIX_GET_NY(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ld2 = STARPU_MATRIX_GET_LD(buffers[1]);
+        unsigned n2 = STARPU_MATRIX_GET_NX(buffers[1]);
+        unsigned m2 = STARPU_MATRIX_GET_NY(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(n == n2);
+	STARPU_ASSERT(m == m2);
+	cudaMemcpy2DAsync(val2, ld2*sizeof(*val2), val, ld*sizeof(*val), n*sizeof(*val), m, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned i, j, k, l;
+        int matrix[NY + 2*SHADOWY][NX + 2*SHADOWX];
+        int matrix2[NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
+	starpu_data_handle_t handle, handle2;
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+			|STARPU_CUDA
+#endif
+			,
+                .cpu_funcs = {cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+                .cuda_funcs = {cuda_func, NULL},
+#endif
+                .nbuffers = 2,
+		.modes = {STARPU_R, STARPU_W}
+        };
+
+	memset(matrix, -1, sizeof(matrix));
+	for(j=1 ; j<=NY ; j++)
+		for(i=1 ; i<=NX ; i++)
+			matrix[SHADOWY+j-1][SHADOWX+i-1] = i+j;
+
+	/* Copy borders */
+	for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+		for(i=0 ; i<SHADOWX ; i++) {
+			matrix[j][i] = matrix[j][i+NX];
+			matrix[j][SHADOWX+NX+i] = matrix[j][SHADOWX+i];
+		}
+	for(j=0 ; j<SHADOWY ; j++)
+		for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+			matrix[j][i] = matrix[j+NY][i];
+			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i];
+		}
+	/* Copy corners */
+	for(j=0 ; j<SHADOWY ; j++)
+		for(i=0 ; i<SHADOWX ; i++) {
+			matrix[j][i] = matrix[j+NY][i+NX];
+			matrix[j][SHADOWX+NX+i] = matrix[j+NY][SHADOWX+i];
+			matrix[SHADOWY+NY+j][i] = matrix[SHADOWY+j][i+NX];
+			matrix[SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWY+j][SHADOWX+i];
+		}
+
+        FPRINTF(stderr,"IN  Matrix:\n");
+	for(j=0 ; j<NY + 2*SHADOWY ; j++)
+	{
+		for(i=0 ; i<NX + 2*SHADOWX ; i++)
+			FPRINTF(stderr, "%5d ", matrix[j][i]);
+		FPRINTF(stderr,"\n");
+	}
+        FPRINTF(stderr,"\n");
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare source matrix to StarPU */
+	starpu_matrix_data_register(&handle, 0, (uintptr_t)matrix, NX + 2*SHADOWX, NX + 2*SHADOWX, NY + 2*SHADOWY, sizeof(matrix[0][0]));
+
+	/* Declare destination matrix to StarPU */
+	starpu_matrix_data_register(&handle2, 0, (uintptr_t)matrix2, NX + PARTSX*2*SHADOWX, NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, sizeof(matrix2[0][0]));
+
+        /* Partition the source matrix in PARTSY*PARTSX sub-matrices with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
+	struct starpu_data_filter fy =
+	{
+		.filter_func = starpu_vertical_block_shadow_filter_func,
+		.nchildren = PARTSY,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
+	};
+	struct starpu_data_filter fx =
+	{
+		.filter_func = starpu_block_shadow_filter_func,
+		.nchildren = PARTSX,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
+	};
+	starpu_data_map_filters(handle, 2, &fy, &fx);
+
+        /* Partition the destination matrix in PARTSY*PARTSX sub-matrices */
+	struct starpu_data_filter fy2 =
+	{
+		.filter_func = starpu_vertical_block_filter_func,
+		.nchildren = PARTSY,
+	};
+	struct starpu_data_filter fx2 =
+	{
+		.filter_func = starpu_block_filter_func,
+		.nchildren = PARTSX,
+	};
+	starpu_data_map_filters(handle2, 2, &fy2, &fx2);
+
+        /* Submit a task on each sub-matrix */
+	for (j=0; j<PARTSY; j++)
+	{
+		for (i=0; i<PARTSX; i++)
+		{
+			starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 2, j, i);
+			starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 2, j, i);
+			struct starpu_task *task = starpu_task_create();
+
+			task->handles[0] = sub_handle;
+			task->handles[1] = sub_handle2;
+			task->cl = &cl;
+			task->synchronous = 1;
+
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV) goto enodev;
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+		}
+	}
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle2, 0);
+        starpu_data_unregister(handle);
+        starpu_data_unregister(handle2);
+	starpu_shutdown();
+
+        FPRINTF(stderr,"OUT Matrix:\n");
+	for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
+	{
+		for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++)
+			FPRINTF(stderr, "%5d ", matrix2[j][i]);
+		FPRINTF(stderr,"\n");
+	}
+        FPRINTF(stderr,"\n");
+	for(j=0 ; j<PARTSY ; j++)
+		for(i=0 ; i<PARTSX ; i++)
+			for (l=0 ; l<NY/PARTSY + 2*SHADOWY ; l++)
+				for (k=0 ; k<NX/PARTSX + 2*SHADOWX ; k++)
+					STARPU_ASSERT(matrix2[j*(NY/PARTSY+2*SHADOWY)+l][i*(NX/PARTSX+2*SHADOWX)+k] == matrix[j*(NY/PARTSY)+l][i*(NX/PARTSX)+k]);
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 332 - 0
examples/filters/shadow3d.c

@@ -0,0 +1,332 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies the use of the 3D matrix shadow filters: a source "matrix" of
+ * NX*NY*NZ elements (plus SHADOW wrap-around elements) is partitioned into
+ * matrices with some shadowing, and these are copied into a destination
+ * "matrix2" of
+ * PARTSX*PARTSY*PARTSZ*((NX/PARTSX+2*SHADOWX)*(NY/PARTSY+2*SHADOWY)*(NZ/PARTSZ+2*SHADOWZ))
+ * elements, partitioned in the traditional way, thus showing how shadowing
+ * shows up.
+ */
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+
+/* Shadow width */
+#define SHADOWX 2
+#define SHADOWY 3
+#define SHADOWZ 4
+#define NX    12
+#define NY    9
+#define NZ    6
+#define PARTSX 4
+#define PARTSY 3
+#define PARTSZ 2
+
+#define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
+
+void cpu_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
+        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
+        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
+        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
+
+	unsigned i, j, k;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(x == x2);
+	STARPU_ASSERT(y == y2);
+	STARPU_ASSERT(z == z2);
+	for (k = 0; k < z; k++)
+		for (j = 0; j < y; j++)
+			for (i = 0; i < x; i++)
+				val2[k*ldz2+j*ldy2+i] = val[k*ldz+j*ldy+i];
+}
+
+#ifdef STARPU_USE_CUDA
+void cuda_func(void *buffers[], void *cl_arg)
+{
+        /* length of the shadowed source matrix */
+        unsigned ldy = STARPU_BLOCK_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_BLOCK_GET_LDZ(buffers[0]);
+        unsigned x = STARPU_BLOCK_GET_NX(buffers[0]);
+        unsigned y = STARPU_BLOCK_GET_NY(buffers[0]);
+        unsigned z = STARPU_BLOCK_GET_NZ(buffers[0]);
+        /* local copy of the shadowed source matrix pointer */
+        int *val = (int *)STARPU_BLOCK_GET_PTR(buffers[0]);
+
+        /* length of the destination matrix */
+        unsigned ldy2 = STARPU_BLOCK_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_BLOCK_GET_LDZ(buffers[1]);
+        unsigned x2 = STARPU_BLOCK_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_BLOCK_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_BLOCK_GET_NZ(buffers[1]);
+        /* local copy of the destination matrix pointer */
+        int *val2 = (int *)STARPU_BLOCK_GET_PTR(buffers[1]);
+
+	unsigned k;
+	cudaError_t cures;
+
+	/* If things go right, sizes should match */
+	STARPU_ASSERT(x == x2);
+	STARPU_ASSERT(y == y2);
+	STARPU_ASSERT(z == z2);
+	for (k = 0; k < z; k++) {
+		cures = cudaMemcpy2DAsync(val2+k*ldz2, ldy2*sizeof(*val2), val+k*ldz, ldy*sizeof(*val),
+				x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
+		STARPU_ASSERT(!cures);
+	}
+	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
+	STARPU_ASSERT(!cures);
+}
+#endif
+
+int main(int argc, char **argv)
+{
+	unsigned i, j, k, l, m, n;
+        int matrix[NZ + 2*SHADOWZ][NY + 2*SHADOWY][NX + 2*SHADOWX];
+        int matrix2[NZ + PARTSZ*2*SHADOWZ][NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX];
+	starpu_data_handle_t handle, handle2;
+	int ret;
+
+        struct starpu_codelet cl =
+	{
+                .where = STARPU_CPU
+#ifdef STARPU_USE_CUDA
+			|STARPU_CUDA
+#endif
+			,
+                .cpu_funcs = {cpu_func, NULL},
+#ifdef STARPU_USE_CUDA
+                .cuda_funcs = {cuda_func, NULL},
+#endif
+                .nbuffers = 2,
+		.modes = {STARPU_R, STARPU_W}
+        };
+
+	memset(matrix, -1, sizeof(matrix));
+	for(k=1 ; k<=NZ ; k++)
+		for(j=1 ; j<=NY ; j++)
+			for(i=1 ; i<=NX ; i++)
+				matrix[SHADOWZ+k-1][SHADOWY+j-1][SHADOWX+i-1] = i+j+k;
+
+	/* Copy planes */
+	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k][j][i+NX];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k][j][SHADOWX+i];
+			}
+	for(k=SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k][j+NY][i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j][i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i];
+			}
+
+	/* Copy borders */
+	for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k][j+NY][i+NX];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k][SHADOWY+j][i+NX];
+				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k][SHADOWY+j][SHADOWX+i];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k][j+NY][SHADOWX+i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j][i+NX];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j][i+NX];
+				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j][SHADOWX+i];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j][SHADOWX+i];
+			}
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=SHADOWX ; i<SHADOWX+NX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j+NY][i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i];
+			}
+
+	/* Copy corners */
+	for(k=0 ; k<SHADOWZ ; k++)
+		for(j=0 ; j<SHADOWY ; j++)
+			for(i=0 ; i<SHADOWX ; i++) {
+				matrix[k][j][i] = matrix[k+NZ][j+NY][i+NX];
+				matrix[k][j][SHADOWX+NX+i] = matrix[k+NZ][j+NY][SHADOWX+i];
+				matrix[k][SHADOWY+NY+j][i] = matrix[k+NZ][SHADOWY+j][i+NX];
+				matrix[k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[k+NZ][SHADOWY+j][SHADOWX+i];
+				matrix[SHADOWZ+NZ+k][j][i] = matrix[SHADOWZ+k][j+NY][i+NX];
+				matrix[SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWZ+k][j+NY][SHADOWX+i];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWZ+k][SHADOWY+j][i+NX];
+				matrix[SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWZ+k][SHADOWY+j][SHADOWX+i];
+			}
+
+        FPRINTF(stderr,"IN  Matrix:\n");
+	for(k=0 ; k<NZ + 2*SHADOWZ ; k++)
+	{
+		for(j=0 ; j<NY + 2*SHADOWY ; j++)
+		{
+			for(i=0 ; i<NX + 2*SHADOWX ; i++)
+				FPRINTF(stderr, "%5d ", matrix[k][j][i]);
+			FPRINTF(stderr,"\n");
+		}
+		FPRINTF(stderr,"\n\n");
+	}
+        FPRINTF(stderr,"\n");
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		exit(77);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	/* Declare source matrix to StarPU */
+	starpu_block_data_register(&handle, 0, (uintptr_t)matrix,
+			NX + 2*SHADOWX, (NX + 2*SHADOWX) * (NY + 2*SHADOWY),
+			NX + 2*SHADOWX, NY + 2*SHADOWY, NZ + 2*SHADOWZ,
+			sizeof(matrix[0][0][0]));
+
+	/* Declare destination matrix to StarPU */
+	starpu_block_data_register(&handle2, 0, (uintptr_t)matrix2,
+			NX + PARTSX*2*SHADOWX, (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY),
+			NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, NZ + PARTSZ*2*SHADOWZ,
+			sizeof(matrix2[0][0][0]));
+
+        /* Partition the source matrix in PARTSZ*PARTSY*PARTSX sub-matrices with shadows */
+	/* NOTE: the resulting handles should only be used in read-only mode,
+	 * as StarPU will not know how the overlapping parts would have to be
+	 * combined. */
+	struct starpu_data_filter fz =
+	{
+		.filter_func = starpu_depth_block_shadow_filter_func_block,
+		.nchildren = PARTSZ,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
+	};
+	struct starpu_data_filter fy =
+	{
+		.filter_func = starpu_vertical_block_shadow_filter_func_block,
+		.nchildren = PARTSY,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
+	};
+	struct starpu_data_filter fx =
+	{
+		.filter_func = starpu_block_shadow_filter_func_block,
+		.nchildren = PARTSX,
+		.filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
+	};
+	starpu_data_map_filters(handle, 3, &fz, &fy, &fx);
+
+        /* Partition the destination matrix in PARTSZ*PARTSY*PARTSX sub-matrices */
+	struct starpu_data_filter fz2 =
+	{
+		.filter_func = starpu_depth_block_filter_func_block,
+		.nchildren = PARTSZ,
+	};
+	struct starpu_data_filter fy2 =
+	{
+		.filter_func = starpu_vertical_block_filter_func_block,
+		.nchildren = PARTSY,
+	};
+	struct starpu_data_filter fx2 =
+	{
+		.filter_func = starpu_block_filter_func_block,
+		.nchildren = PARTSX,
+	};
+	starpu_data_map_filters(handle2, 3, &fz2, &fy2, &fx2);
+
+        /* Submit a task on each sub-matrix */
+	for (k=0; k<PARTSZ; k++)
+	{
+		for (j=0; j<PARTSY; j++)
+		{
+			for (i=0; i<PARTSX; i++)
+			{
+				starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 3, k, j, i);
+				starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 3, k, j, i);
+				struct starpu_task *task = starpu_task_create();
+
+				task->handles[0] = sub_handle;
+				task->handles[1] = sub_handle2;
+				task->cl = &cl;
+				task->synchronous = 1;
+
+				ret = starpu_task_submit(task);
+				if (ret == -ENODEV) goto enodev;
+				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+			}
+		}
+	}
+
+	starpu_data_unpartition(handle, 0);
+	starpu_data_unpartition(handle2, 0);
+        starpu_data_unregister(handle);
+        starpu_data_unregister(handle2);
+	starpu_shutdown();
+
+        FPRINTF(stderr,"OUT Matrix:\n");
+	for(k=0 ; k<NZ + PARTSZ*2*SHADOWZ ; k++)
+	{
+		for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
+		{
+			for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++) {
+				FPRINTF(stderr, "%5d ", matrix2[k][j][i]);
+			}
+			FPRINTF(stderr,"\n");
+		}
+		FPRINTF(stderr,"\n\n");
+	}
+        FPRINTF(stderr,"\n");
+	for(k=0 ; k<PARTSZ ; k++)
+		for(j=0 ; j<PARTSY ; j++)
+			for(i=0 ; i<PARTSX ; i++)
+				for (n=0 ; n<NZ/PARTSZ + 2*SHADOWZ ; n++)
+					for (m=0 ; m<NY/PARTSY + 2*SHADOWY ; m++)
+						for (l=0 ; l<NX/PARTSX + 2*SHADOWX ; l++)
+							STARPU_ASSERT(matrix2[k*(NZ/PARTSZ+2*SHADOWZ)+n][j*(NY/PARTSY+2*SHADOWY)+m][i*(NX/PARTSX+2*SHADOWX)+l] ==
+									matrix[k*(NZ/PARTSZ)+n][j*(NY/PARTSY)+m][i*(NX/PARTSX)+l]);
+
+	return 0;
+
+enodev:
+	FPRINTF(stderr, "WARNING: No one can execute this task\n");
+	starpu_shutdown();
+	return 77;
+}

+ 5 - 1
examples/gl_interop/gl_interop.c

@@ -27,13 +27,17 @@
 #include <starpu.h>
 #include <unistd.h>
 #include <GL/glut.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 
 void dummy(void *buffers[], void *cl_arg)
 {
 	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
 
 	printf("Codelet running\n");
-	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	printf("Codelet done\n");
 }
 

+ 5 - 1
examples/gl_interop/gl_interop_idle.c

@@ -30,13 +30,17 @@
 #include <starpu.h>
 #include <unistd.h>
 #include <GL/glut.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 
 void dummy(void *buffers[], void *cl_arg)
 {
 	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
 
 	printf("Codelet running\n");
-	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	printf("Codelet done\n");
 }
 

+ 2 - 1
examples/heat/dw_factolu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -28,6 +28,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
+#include <starpu_cuda.h>
 #endif
 
 #include "../common/blas.h"

+ 7 - 7
examples/heat/dw_factolu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -137,7 +137,7 @@ static inline void dw_common_cpu_codelet_update_u22(void *descr[], int s, __attr
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -202,7 +202,7 @@ static inline void dw_common_codelet_update_u12(void *descr[], int s, __attribut
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -264,7 +264,7 @@ static inline void dw_common_codelet_update_u21(void *descr[], int s, __attribut
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -344,8 +344,8 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 			for (z = 0; z < nx; z++)
 			{
 				float pivot;
-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(pivot != 0.0f);
 				
@@ -357,7 +357,7 @@ static inline void dw_common_codelet_update_u11(void *descr[], int s, __attribut
 								&sub11[(z+1) + (z+1)*ld],ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 1 - 0
examples/heat/heat.c

@@ -788,6 +788,7 @@ int main(int argc, char **argv)
 		if (check)
 			solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
 
+		starpu_helper_cublas_shutdown();
 		starpu_shutdown();
 		free_system(A, B, newsize, pinned);
 	}

+ 13 - 13
examples/lu/xlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,7 +70,7 @@ static inline void STARPU_LU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -158,7 +158,7 @@ static inline void STARPU_LU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -243,7 +243,7 @@ static inline void STARPU_LU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -328,8 +328,8 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(fpclassify(pivot) != FP_ZERO);
 				
@@ -342,7 +342,7 @@ static inline void STARPU_LU(common_u11)(void *descr[],
 						(CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -450,8 +450,8 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 			{
 				TYPE pivot;
 				TYPE inv_pivot;
-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
 				{
@@ -466,8 +466,8 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 						CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
 					}
 
-					cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
-					cudaStreamSynchronize(0);
+					cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+					cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				}
 
 				STARPU_ASSERT(pivot != 0.0);
@@ -482,7 +482,7 @@ static inline void STARPU_LU(common_u11_pivot)(void *descr[],
 				
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -570,7 +570,7 @@ static inline void STARPU_LU(common_pivot)(void *descr[],
 				}
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 8 - 6
examples/pi/pi_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -235,8 +235,8 @@ static void init_cpu_func(void *descr[], void *cl_arg)
 static void init_cuda_func(void *descr[], void *cl_arg)
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-        cudaMemset(val, 0, sizeof(unsigned long));
-        cudaThreadSynchronize();
+        cudaMemsetAsync(val, 0, sizeof(unsigned long), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -263,12 +263,14 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 
 	unsigned long h_a, h_b;
 	
-	cudaMemcpy(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);
-	cudaMemcpy(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost);
+	cudaMemcpyAsync(&h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaMemcpyAsync(&h_b, d_b, sizeof(h_b), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	h_a += h_b;
 
-	cudaMemcpy(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice);
+	cudaMemcpyAsync(d_a, &h_a, sizeof(h_a), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 };
 #endif
 

+ 7 - 11
examples/reductions/dot_product.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2012 inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -79,8 +79,8 @@ void init_cpu_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -237,20 +237,16 @@ void dot_cuda_func(void *descr[], void *cl_arg)
 
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
-
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 
 	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
 
 	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	current_dot += local_dot;
 
-	cudaThreadSynchronize();
-
-	cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
-
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 7 - 7
gcc-plugin/examples/cholesky/cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -53,7 +53,7 @@ static inline void chol_common_cpu_codelet_update_u22(const float *left, const f
 			st = cublasGetError();
 			STARPU_ASSERT(!st);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -99,7 +99,7 @@ static inline void chol_common_codelet_update_u21(const float *sub11, float *sub
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -170,15 +170,15 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
-				cudaError_t cures = cudaThreadSynchronize();
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				STARPU_ASSERT(!cures);
 			}
 #else
 			for (z = 0; z < nx; z++)
 			{
 				float lambda11;
-				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 
@@ -193,7 +193,7 @@ static inline void chol_common_codelet_update_u11(float *sub11, unsigned nx, uns
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 			break;
 #endif

+ 9 - 1
include/starpu_data_filters.h

@@ -60,9 +60,12 @@ void starpu_data_vmap_filters(starpu_data_handle_t root_data, unsigned nfilters,
 /* for BCSR */
 void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
-/* (filters for BLAS interface) */
+
+/* (filters for matrix interface) */
 void starpu_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vertical_block_shadow_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 /* for vector */
 void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
@@ -72,6 +75,11 @@ void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_i
 
 /* for block */
 void starpu_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vertical_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_vertical_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_depth_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+void starpu_depth_block_shadow_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
 #ifdef __cplusplus
 }

+ 3 - 1
include/starpu_deprecated_api.h

@@ -39,11 +39,13 @@ typedef struct starpu_multiformat_interface starpu_multiformat_interface_t;
 #define starpu_buffer_descr_t starpu_buffer_descr
 #define starpu_history_list_t starpu_history_list
 #define starpu_regression_model_t starpu_regression_model
-#define starpu_per_arch_perfmodel_t starpu_per_arch_perfmodel
 #define starpu_perfmodel_t starpu_perfmodel
 #define starpu_sched_policy_s starpu_sched_policy
 #define starpu_data_interface_ops_t starpu_data_interface_ops
 
+#define starpu_per_arch_perfmodel_t starpu_per_arch_perfmodel
+#define starpu_per_arch_perfmodel starpu_perfmodel_per_arch
+
 typedef struct starpu_buffer_descr starpu_buffer_descr;
 typedef struct starpu_codelet starpu_codelet;
 typedef struct starpu_codelet starpu_codelet_t;

+ 17 - 21
include/starpu_perfmodel.h

@@ -34,9 +34,6 @@ extern "C"
 #endif
 
 struct starpu_task;
-
-struct starpu_history_table;
-struct starpu_history_list;
 struct starpu_buffer_descr;
 
 /*
@@ -72,7 +69,7 @@ _Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
 
 #define STARPU_NARCH_VARIATIONS	(STARPU_GORDON_DEFAULT+1)
 
-struct starpu_history_entry
+struct starpu_perfmodel_history_entry
 {
 	//double measured;
 
@@ -108,19 +105,13 @@ struct starpu_history_entry
 #endif
 };
 
-struct starpu_history_list
-{
-	struct starpu_history_list *next;
-	struct starpu_history_entry *entry;
-};
-
-struct starpu_model_list
+struct starpu_perfmodel_history_list
 {
-	struct starpu_model_list *next;
-	struct starpu_perfmodel *model;
+	struct starpu_perfmodel_history_list *next;
+	struct starpu_perfmodel_history_entry *entry;
 };
 
-struct starpu_regression_model
+struct starpu_perfmodel_regression_model
 {
 	/* sum of ln(measured) */
 	double sumlny;
@@ -148,16 +139,18 @@ struct starpu_regression_model
 	unsigned nsample;
 };
 
-struct starpu_per_arch_perfmodel
+struct starpu_perfmodel_history_table;
+
+struct starpu_perfmodel_per_arch
 {
 	double (*cost_model)(struct starpu_buffer_descr *t) STARPU_DEPRECATED; /* returns expected duration in µs */
 	double (*cost_function)(struct starpu_task *task, enum starpu_perf_archtype arch, unsigned nimpl); /* returns expected duration in µs */
 	size_t (*size_base)(struct starpu_task *, enum starpu_perf_archtype arch, unsigned nimpl);
 
 	/* internal variables */
-	struct starpu_history_table *history;
-	struct starpu_history_list *list;
-	struct starpu_regression_model regression;
+	struct starpu_perfmodel_history_table *history;
+	struct starpu_perfmodel_history_list *list;
+	struct starpu_perfmodel_regression_model regression;
 #ifdef STARPU_MODEL_DEBUG
 	char debug_path[256];
 #endif
@@ -184,7 +177,7 @@ struct starpu_perfmodel
 	size_t (*size_base)(struct starpu_task *, unsigned nimpl);
 
 	/* per-architecture model */
-	struct starpu_per_arch_perfmodel per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
+	struct starpu_perfmodel_per_arch per_arch[STARPU_NARCH_VARIATIONS][STARPU_MAXIMPLEMENTATIONS];
 
 	/* Name of the performance model, this is used as a file name when saving history-based performance models */
 	const char *symbol;
@@ -204,11 +197,14 @@ enum starpu_perf_archtype starpu_worker_get_perf_archtype(int workerid);
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
-int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel *model);
+int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model);
 void starpu_perfmodel_debugfilepath(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, char *path, size_t maxlen, unsigned nimpl);
 void starpu_perfmodel_get_arch_name(enum starpu_perf_archtype arch, char *archname, size_t maxlen, unsigned nimpl);
-int starpu_list_models(FILE *output);
 double starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, uint32_t footprint);
+int starpu_perfmodel_list(FILE *output);
+void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output);
+int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output);
+
 void starpu_perfmodel_update_history(struct starpu_perfmodel *model, struct starpu_task *, enum starpu_perf_archtype arch, unsigned cpuid, unsigned nimpl, double measured);
 
 void starpu_bus_print_bandwidth(FILE *f);

+ 8 - 7
mpi/examples/cholesky/mpi_cholesky_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -22,6 +22,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
+#include <starpu_cuda.h>
 #ifdef STARPU_HAVE_MAGMA
 #include "magma.h"
 #include "magma_lapack.h"
@@ -65,7 +66,7 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
 			st = cublasGetError();
 			STARPU_ASSERT(!st);
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -114,7 +115,7 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
 #ifdef STARPU_USE_CUDA
 		case 1:
 			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 			break;
 #endif
 		default:
@@ -188,15 +189,15 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
-				cudaError_t cures = cudaThreadSynchronize();
+				cudaError_t cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 				STARPU_ASSERT(!cures);
 			}
 #else
 			for (z = 0; z < nx; z++)
 			{
 				float lambda11;
-				cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(lambda11 != 0.0f);
 
@@ -211,7 +212,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, __attrib
 							&sub11[(z+1)+(z+1)*ld], ld);
 			}
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif
 			break;
 #endif

+ 7 - 7
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -70,7 +70,7 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -175,7 +175,7 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 			if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
 				STARPU_ABORT();
 
-			if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
+			if (STARPU_UNLIKELY((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			break;
@@ -282,7 +282,7 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 			if (status != CUBLAS_STATUS_SUCCESS)
 				STARPU_ABORT();
 
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif
@@ -381,8 +381,8 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 			for (z = 0; z < nx; z++)
 			{
 				TYPE pivot;
-				cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
-				cudaStreamSynchronize(0);
+				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+				cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 				STARPU_ASSERT(pivot != 0.0);
 				
@@ -394,7 +394,7 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 						&sub11[(z+1) + (z+1)*ld],ld);
 			}
 			
-			cudaThreadSynchronize();
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 			break;
 #endif

+ 4 - 1
mpi/examples/mpi_lu/pxlu_kernels.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,6 +19,9 @@
 #define __PXLU_KERNELS_H__
 
 #include <starpu.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 
 #define str(s) #s
 #define xstr(s)        str(s)

+ 1 - 0
src/Makefile.am

@@ -143,6 +143,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 						\
 	core/perfmodel/perfmodel_history.c			\
 	core/perfmodel/perfmodel_bus.c				\
 	core/perfmodel/perfmodel.c				\
+	core/perfmodel/perfmodel_print.c			\
 	core/perfmodel/regression.c				\
 	core/sched_policy.c					\
 	core/sched_ctx.c					\

+ 6 - 0
src/core/perfmodel/perfmodel.h

@@ -26,6 +26,12 @@
 #include <pthread.h>
 #include <stdio.h>
 
+struct _starpu_perfmodel_list
+{
+	struct _starpu_perfmodel_list *next;
+	struct starpu_perfmodel *model;
+};
+
 struct starpu_buffer_descr;
 struct _starpu_job;
 enum starpu_perf_archtype;

+ 12 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -792,8 +792,10 @@ int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
 
 void starpu_bus_print_affinity(FILE *f)
 {
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	unsigned cpu;
 	int gpu;
+#endif
 
 	fprintf(f, "# GPU\tCPU in preference order (logical index)\n");
 
@@ -1108,7 +1110,7 @@ void starpu_bus_print_bandwidth(FILE *f)
         maxnode += nopencl;
 #endif
 
-	fprintf(f, "from to\t");
+	fprintf(f, "from/to\t");
 	fprintf(f, "RAM\t");
 	for (dst = 0; dst < ncuda; dst++)
 		fprintf(f, "CUDA %d\t", dst);
@@ -1130,6 +1132,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 		fprintf(f, "\n");
 	}
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 	fprintf(f, "\nGPU\tCPU in preference order (logical index), host-to-device, device-to-host\n");
 	for (src = 1; src <= maxnode; src++)
 	{
@@ -1138,6 +1141,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 		int ncpus = _starpu_topology_get_nhwcpu(config);
 		int cpu;
 
+#ifdef STARPU_USE_CUDA
 		if (src <= ncuda)
 		{
 			fprintf(f, "CUDA %d\t", src-1);
@@ -1150,7 +1154,11 @@ void starpu_bus_print_bandwidth(FILE *f)
 					fprintf(f, "%d\t", cuda_affinity_matrix[src-1][cpu]);
 			}
 		}
+#ifdef STARPU_USE_OPENCL
 		else
+#endif
+#endif
+#ifdef STARPU_USE_OPENCL
 		{
 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
 			for (cpu = 0; cpu < ncpus; cpu++)
@@ -1162,8 +1170,10 @@ void starpu_bus_print_bandwidth(FILE *f)
 					fprintf(f, "%d\t", opencl_affinity_matrix[src-1][cpu]);
 			}
 		}
+#endif
 		fprintf(f, "\n");
 	}
+#endif
 }
 
 static void generate_bus_bandwidth_file(void)
@@ -1297,6 +1307,7 @@ static void generate_bus_config_file()
 
 static void starpu_force_bus_sampling(void)
 {
+	_STARPU_DEBUG("Force bus sampling ...\n");
 	_starpu_create_sampling_directory_if_needed();
 
 	generate_bus_affinity_file();

+ 45 - 45
src/core/perfmodel/perfmodel_history.c

@@ -39,11 +39,11 @@
 #define HASH_ADD_UINT32_T(head,field,add) HASH_ADD(hh,head,field,sizeof(uint32_t),add)
 #define HASH_FIND_UINT32_T(head,find,out) HASH_FIND(hh,head,find,sizeof(uint32_t),out)
 
-struct starpu_history_table
+struct starpu_perfmodel_history_table
 {
 	UT_hash_handle hh;
 	uint32_t footprint;
-	struct starpu_history_entry *history_entry;
+	struct starpu_perfmodel_history_entry *history_entry;
 };
 
 /* We want more than 10% variance on X to trust regression */
@@ -51,17 +51,17 @@ struct starpu_history_table
 	((reg_model)->minx < (9*(reg_model)->maxx)/10 && (reg_model)->nsample >= _STARPU_CALIBRATION_MINIMUM)
 
 static pthread_rwlock_t registered_models_rwlock;
-static struct starpu_model_list *registered_models = NULL;
+static struct _starpu_perfmodel_list *registered_models = NULL;
 
 /*
  * History based model
  */
-static void insert_history_entry(struct starpu_history_entry *entry, struct starpu_history_list **list, struct starpu_history_table **history_ptr)
+static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, struct starpu_perfmodel_history_list **list, struct starpu_perfmodel_history_table **history_ptr)
 {
-	struct starpu_history_list *link;
-	struct starpu_history_table *table;
+	struct starpu_perfmodel_history_list *link;
+	struct starpu_perfmodel_history_table *table;
 
-	link = (struct starpu_history_list *) malloc(sizeof(struct starpu_history_list));
+	link = (struct starpu_perfmodel_history_list *) malloc(sizeof(struct starpu_perfmodel_history_list));
 	link->next = *list;
 	link->entry = entry;
 	*list = link;
@@ -70,7 +70,7 @@ static void insert_history_entry(struct starpu_history_entry *entry, struct star
 	//HASH_FIND_UINT32_T(*history_ptr, &entry->footprint, table);
 	//STARPU_ASSERT(table == NULL);
 
-	table = (struct starpu_history_table*) malloc(sizeof(*table));
+	table = (struct starpu_perfmodel_history_table*) malloc(sizeof(*table));
 	STARPU_ASSERT(table != NULL);
 	table->footprint = entry->footprint;
 	table->history_entry = entry;
@@ -79,10 +79,10 @@ static void insert_history_entry(struct starpu_history_entry *entry, struct star
 
 static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, unsigned arch, unsigned nimpl)
 {
-	struct starpu_per_arch_perfmodel *per_arch_model;
+	struct starpu_perfmodel_per_arch *per_arch_model;
 
 	per_arch_model = &model->per_arch[arch][nimpl];
-	struct starpu_regression_model *reg_model;
+	struct starpu_perfmodel_regression_model *reg_model;
 	reg_model = &per_arch_model->regression;
 
 	/*
@@ -116,7 +116,7 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, unsigned arc
 	fprintf(f, "%-15le\t%-15le\t%-15le\n", a, b, c);
 }
 
-static void scan_reg_model(FILE *f, struct starpu_regression_model *reg_model)
+static void scan_reg_model(FILE *f, struct starpu_perfmodel_regression_model *reg_model)
 {
 	int res;
 
@@ -151,12 +151,12 @@ static void scan_reg_model(FILE *f, struct starpu_regression_model *reg_model)
 	reg_model->nl_valid = !nl_invalid && VALID_REGRESSION(reg_model);
 }
 
-static void dump_history_entry(FILE *f, struct starpu_history_entry *entry)
+static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 {
 	fprintf(f, "%08x\t%-15lu\t%-15le\t%-15le\t%-15le\t%-15le\t%u\n", entry->footprint, (unsigned long) entry->size, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
 }
 
-static void scan_history_entry(FILE *f, struct starpu_history_entry *entry)
+static void scan_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
 {
 	int res;
 
@@ -195,7 +195,7 @@ static void scan_history_entry(FILE *f, struct starpu_history_entry *entry)
 	}
 }
 
-static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel *per_arch_model, unsigned scan_history)
+static void parse_per_arch_model_file(FILE *f, struct starpu_perfmodel_per_arch *per_arch_model, unsigned scan_history)
 {
 	unsigned nentries;
 
@@ -210,10 +210,10 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel
 	unsigned i;
 	for (i = 0; i < nentries; i++)
 	{
-		struct starpu_history_entry *entry = NULL;
+		struct starpu_perfmodel_history_entry *entry = NULL;
 		if (scan_history)
 		{
-			entry = (struct starpu_history_entry *) malloc(sizeof(struct starpu_history_entry));
+			entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
 			STARPU_ASSERT(entry);
 		}
 
@@ -227,7 +227,7 @@ static void parse_per_arch_model_file(FILE *f, struct starpu_per_arch_perfmodel
 
 static void parse_arch(FILE *f, struct starpu_perfmodel *model, unsigned scan_history, unsigned archmin, unsigned archmax, unsigned skiparch)
 {
-	struct starpu_per_arch_perfmodel dummy;
+	struct starpu_perfmodel_per_arch dummy;
 	int nimpls, implmax, skipimpl, impl;
 	unsigned ret, arch;
 
@@ -351,11 +351,11 @@ static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned s
 
 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, unsigned arch, unsigned nimpl)
 {
-	struct starpu_per_arch_perfmodel *per_arch_model;
+	struct starpu_perfmodel_per_arch *per_arch_model;
 
 	per_arch_model = &model->per_arch[arch][nimpl];
 	/* count the number of elements in the lists */
-	struct starpu_history_list *ptr = NULL;
+	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
@@ -394,10 +394,10 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, un
 
 static unsigned get_n_entries(struct starpu_perfmodel *model, unsigned arch, unsigned impl)
 {
-	struct starpu_per_arch_perfmodel *per_arch_model;
+	struct starpu_perfmodel_per_arch *per_arch_model;
 	per_arch_model = &model->per_arch[arch][impl];
 	/* count the number of elements in the lists */
-	struct starpu_history_list *ptr = NULL;
+	struct starpu_perfmodel_history_list *ptr = NULL;
 	unsigned nentries = 0;
 
 	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
@@ -537,7 +537,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 	}
 }
 
-static void initialize_per_arch_model(struct starpu_per_arch_perfmodel *per_arch_model)
+static void initialize_per_arch_model(struct starpu_perfmodel_per_arch *per_arch_model)
 {
 	per_arch_model->history = NULL;
 	per_arch_model->list = NULL;
@@ -603,7 +603,7 @@ int _starpu_register_model(struct starpu_perfmodel *model)
 	}
 
 	/* add the model to a linked list */
-	struct starpu_model_list *node = (struct starpu_model_list *) malloc(sizeof(struct starpu_model_list));
+	struct _starpu_perfmodel_list *node = (struct _starpu_perfmodel_list *) malloc(sizeof(struct _starpu_perfmodel_list));
 
 	node->model = model;
 	//model->debug_modelid = debug_modelid++;
@@ -673,7 +673,7 @@ static void _starpu_dump_registered_models(void)
 {
 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&registered_models_rwlock);
 
-	struct starpu_model_list *node;
+	struct _starpu_perfmodel_list *node;
 	node = registered_models;
 
 	_STARPU_DEBUG("DUMP MODELS !\n");
@@ -701,7 +701,7 @@ void _starpu_deinitialize_registered_performance_models(void)
 
 	_STARPU_PTHREAD_RWLOCK_WRLOCK(&registered_models_rwlock);
 
-	struct starpu_model_list *node, *pnode;
+	struct _starpu_perfmodel_list *node, *pnode;
 	node = registered_models;
 
 	_STARPU_DEBUG("FREE MODELS !\n");
@@ -717,9 +717,9 @@ void _starpu_deinitialize_registered_performance_models(void)
 		{
 			for (nimpl = 0; nimpl < STARPU_MAXIMPLEMENTATIONS; nimpl++)
 			{
-				struct starpu_per_arch_perfmodel *archmodel = &model->per_arch[arch][nimpl];
-				struct starpu_history_list *list, *plist;
-				struct starpu_history_table *entry, *tmp;
+				struct starpu_perfmodel_per_arch *archmodel = &model->per_arch[arch][nimpl];
+				struct starpu_perfmodel_history_list *list, *plist;
+				struct starpu_perfmodel_history_table *entry, *tmp;
 
 				HASH_ITER(hh, archmodel->history, entry, tmp)
 				{
@@ -846,7 +846,7 @@ void _starpu_load_history_based_model(struct starpu_perfmodel *model, unsigned s
 
 /* This function is intended to be used by external tools that should read
  * the performance model files */
-int starpu_list_models(FILE *output)
+int starpu_perfmodel_list(FILE *output)
 {
         char path[256];
         DIR *dp;
@@ -876,7 +876,7 @@ int starpu_list_models(FILE *output)
 
 /* This function is intended to be used by external tools that should read the
  * performance model files */
-int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel *model)
+int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *model)
 {
 	model->symbol = strdup(symbol);
 	initialize_model(model);
@@ -899,7 +899,7 @@ int starpu_load_history_debug(const char *symbol, struct starpu_perfmodel *model
 			symbol2[dot-symbol] = '\0';
 			int ret;
 			fprintf(stderr,"note: loading history from %s instead of %s\n", symbol2, symbol);
-			ret = starpu_load_history_debug(symbol2,model);
+			ret = starpu_perfmodel_load_symbol(symbol2,model);
 			free(symbol2);
 			return ret;
 		}
@@ -970,7 +970,7 @@ double _starpu_regression_based_job_expected_perf(struct starpu_perfmodel *model
 {
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
-	struct starpu_regression_model *regmodel;
+	struct starpu_perfmodel_regression_model *regmodel;
 
 	regmodel = &model->per_arch[arch][nimpl].regression;
 
@@ -984,7 +984,7 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 {
 	double exp = NAN;
 	size_t size = _starpu_job_get_data_size(model, arch, nimpl, j);
-	struct starpu_regression_model *regmodel;
+	struct starpu_perfmodel_regression_model *regmodel;
 
 	regmodel = &model->per_arch[arch][nimpl].regression;
 
@@ -993,9 +993,9 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 	else
 	{
 		uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
-		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
-		struct starpu_history_table *history;
-		struct starpu_history_table *entry;
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch][nimpl];
+		struct starpu_perfmodel_history_table *history;
+		struct starpu_perfmodel_history_table *entry;
 
 		_STARPU_PTHREAD_RWLOCK_RDLOCK(&model->model_rwlock);
 		history = per_arch_model->history;
@@ -1018,9 +1018,9 @@ double _starpu_non_linear_regression_based_job_expected_perf(struct starpu_perfm
 double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, struct _starpu_job *j,unsigned nimpl)
 {
 	double exp;
-	struct starpu_per_arch_perfmodel *per_arch_model;
-	struct starpu_history_entry *entry;
-	struct starpu_history_table *history, *elt;
+	struct starpu_perfmodel_per_arch *per_arch_model;
+	struct starpu_perfmodel_history_entry *entry;
+	struct starpu_perfmodel_history_table *history, *elt;
 
 	uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
@@ -1077,13 +1077,13 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 	{
 		_STARPU_PTHREAD_RWLOCK_WRLOCK(&model->model_rwlock);
 
-		struct starpu_per_arch_perfmodel *per_arch_model = &model->per_arch[arch][nimpl];
+		struct starpu_perfmodel_per_arch *per_arch_model = &model->per_arch[arch][nimpl];
 
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
-			struct starpu_history_entry *entry;
-			struct starpu_history_table *elt;
-			struct starpu_history_list **list;
+			struct starpu_perfmodel_history_entry *entry;
+			struct starpu_perfmodel_history_table *elt;
+			struct starpu_perfmodel_history_list **list;
 			uint32_t key = _starpu_compute_buffers_footprint(model, arch, nimpl, j);
 
 			list = &per_arch_model->list;
@@ -1094,7 +1094,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 			if (!entry)
 			{
 				/* this is the first entry with such a footprint */
-				entry = (struct starpu_history_entry *) malloc(sizeof(struct starpu_history_entry));
+				entry = (struct starpu_perfmodel_history_entry *) malloc(sizeof(struct starpu_perfmodel_history_entry));
 				STARPU_ASSERT(entry);
 				entry->mean = measured;
 				entry->sum = measured;
@@ -1126,7 +1126,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
 		if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
-			struct starpu_regression_model *reg_model;
+			struct starpu_perfmodel_regression_model *reg_model;
 			reg_model = &per_arch_model->regression;
 
 			/* update the regression model */

+ 254 - 0
src/core/perfmodel/perfmodel_print.c

@@ -0,0 +1,254 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011  Télécom-SudParis
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_perfmodel.h>
+#include <common/config.h>
+
+static
+void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per_arch_model, char *parameter, uint32_t *footprint, FILE *output)
+{
+	struct starpu_perfmodel_history_list *ptr;
+
+	ptr = per_arch_model->list;
+
+	if (!parameter && ptr)
+		fprintf(output, "# hash\t\tsize\t\tmean\t\tstddev\t\tn\n");
+
+	while (ptr)
+	{
+		struct starpu_perfmodel_history_entry *entry = ptr->entry;
+		if (!footprint || entry->footprint == *footprint)
+		{
+			if (!parameter)
+			{
+				/* There isn't a parameter that is explicitely requested, so we display all parameters */
+				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
+					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
+			}
+			else
+			{
+				/* only display the parameter that was specifically requested */
+				if (strcmp(parameter, "mean") == 0)
+				{
+					printf("%-15le\n", entry->mean);
+				}
+
+				if (strcmp(parameter, "stddev") == 0)
+				{
+					printf("%-15le\n", entry->deviation);
+					return;
+				}
+			}
+		}
+
+		ptr = ptr->next;
+	}
+}
+
+void starpu_perfmodel_print(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl, char *parameter, uint32_t *footprint, FILE *output)
+{
+	struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][nimpl];
+	char archname[32];
+
+	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
+	{
+		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
+		fprintf(output, "performance model for %s\n", archname);
+	}
+
+	if (parameter == NULL)
+	{
+		/* no specific parameter was requested, so we display everything */
+		if (arch_model->regression.nsample)
+		{
+			fprintf(output, "\tRegression : #sample = %d\n", arch_model->regression.nsample);
+		}
+
+		/* Only display the regression model if we could actually build a model */
+		if (arch_model->regression.valid)
+		{
+			fprintf(output, "\tLinear: y = alpha size ^ beta\n");
+			fprintf(output, "\t\talpha = %e\n", arch_model->regression.alpha);
+			fprintf(output, "\t\tbeta = %e\n", arch_model->regression.beta);
+		}
+		else
+		{
+			//fprintf(output, "\tLinear model is INVALID\n");
+		}
+
+		if (arch_model->regression.nl_valid)
+		{
+			fprintf(output, "\tNon-Linear: y = a size ^b + c\n");
+			fprintf(output, "\t\ta = %e\n", arch_model->regression.a);
+			fprintf(output, "\t\tb = %e\n", arch_model->regression.b);
+			fprintf(output, "\t\tc = %e\n", arch_model->regression.c);
+		}
+		else
+		{
+			//fprintf(output, "\tNon-Linear model is INVALID\n");
+		}
+
+		_starpu_perfmodel_print_history_based(arch_model, parameter, footprint, output);
+
+#if 0
+		char debugname[1024];
+		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
+		printf("\t debug file path : %s\n", debugname);
+#endif
+	}
+	else
+	{
+		/* only display the parameter that was specifically requested */
+		if (strcmp(parameter, "a") == 0)
+		{
+			printf("%e\n", arch_model->regression.a);
+			return;
+		}
+
+		if (strcmp(parameter, "b") == 0)
+		{
+			printf("%e\n", arch_model->regression.b);
+			return;
+		}
+
+		if (strcmp(parameter, "c") == 0)
+		{
+			printf("%e\n", arch_model->regression.c);
+			return;
+		}
+
+		if (strcmp(parameter, "alpha") == 0)
+		{
+			printf("%e\n", arch_model->regression.alpha);
+			return;
+		}
+
+		if (strcmp(parameter, "beta") == 0)
+		{
+			printf("%e\n", arch_model->regression.beta);
+			return;
+		}
+
+		if (strcmp(parameter, "path-file-debug") == 0)
+		{
+			char debugname[256];
+			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
+			printf("%s\n", debugname);
+			return;
+		}
+
+		if ((strcmp(parameter, "mean") == 0) || (strcmp(parameter, "stddev") == 0))
+		{
+			_starpu_perfmodel_print_history_based(arch_model, parameter, footprint, output);
+			return;
+		}
+
+		/* TODO display if it's valid ? */
+
+		fprintf(output, "Unknown parameter requested, aborting.\n");
+		exit(-1);
+	}
+}
+
+int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char *parameter, uint32_t *footprint, FILE *output)
+{
+	if (arch == NULL)
+	{
+		/* display all architectures */
+		unsigned archid;
+		unsigned implid;
+		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
+		{
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+			{ /* Display all codelets on each arch */
+				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+			}
+		}
+	}
+	else
+	{
+		if (strcmp(arch, "cpu") == 0)
+		{
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				starpu_perfmodel_print(model, STARPU_CPU_DEFAULT,implid, parameter, footprint, output); /* Display all codelets on cpu */
+			return 0;
+		}
+
+		int k;
+		if (sscanf(arch, "cpu:%d", &k) == 1)
+		{
+			/* For combined CPU workers */
+			if ((k < 1) || (k > STARPU_MAXCPUS))
+			{
+				fprintf(output, "Invalid CPU size\n");
+				exit(-1);
+			}
+
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				starpu_perfmodel_print(model, (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + k - 1), implid, parameter, footprint, output);
+			return 0;
+		}
+
+		if (strcmp(arch, "cuda") == 0)
+		{
+			unsigned archid;
+			unsigned implid;
+			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
+			{
+				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
+				{
+					char archname[32];
+					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
+					fprintf(output, "performance model for %s\n", archname);
+					starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+				}
+			}
+			return 0;
+		}
+
+		/* There must be a cleaner way ! */
+		int gpuid;
+		int nmatched;
+		nmatched = sscanf(arch, "cuda_%d", &gpuid);
+		if (nmatched == 1)
+		{
+			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				starpu_perfmodel_print(model, (enum starpu_perf_archtype) archid, implid, parameter, footprint, output);
+			return 0;
+		}
+
+		if (strcmp(arch, "gordon") == 0)
+		{
+			fprintf(output, "performance model for gordon\n");
+			unsigned implid;
+			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
+				starpu_perfmodel_print(model, STARPU_GORDON_DEFAULT, implid, parameter, footprint, output);
+			return 0;
+		}
+
+		fprintf(output, "Unknown architecture requested\n");
+		return -1;
+	}
+	return 0;
+}
+

+ 5 - 5
src/core/perfmodel/regression.c

@@ -112,11 +112,11 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	return r;
 }
 
-static unsigned find_list_size(struct starpu_history_list *list_history)
+static unsigned find_list_size(struct starpu_perfmodel_history_list *list_history)
 {
 	unsigned cnt = 0;
 
-	struct starpu_history_list *ptr = list_history;
+	struct starpu_perfmodel_history_list *ptr = list_history;
 	while (ptr)
 	{
 		cnt++;
@@ -139,9 +139,9 @@ static double find_list_min(double *y, unsigned n)
 	return min;
 }
 
-static void dump_list(unsigned *x, double *y, struct starpu_history_list *list_history)
+static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
 {
-	struct starpu_history_list *ptr = list_history;
+	struct starpu_perfmodel_history_list *ptr = list_history;
 	unsigned i = 0;
 
 	while (ptr)
@@ -159,7 +159,7 @@ static void dump_list(unsigned *x, double *y, struct starpu_history_list *list_h
  * 	return 0 if success, -1 otherwise
  * 	if success, a, b and c are modified
  * */
-int _starpu_regression_non_linear_power(struct starpu_history_list *ptr, double *a, double *b, double *c)
+int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
 {
 	unsigned n = find_list_size(ptr);
 

+ 1 - 1
src/core/perfmodel/regression.h

@@ -24,6 +24,6 @@
 #include <core/perfmodel/perfmodel.h>
 #include <starpu.h>
 
-int _starpu_regression_non_linear_power(struct starpu_history_list *ptr, double *a, double *b, double *c);
+int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c);
 
 #endif // __REGRESSION_H__ 

+ 18 - 4
src/core/sched_policy.c

@@ -416,8 +416,6 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 	handle->busy_count++;
 	_starpu_spin_unlock(&handle->header_lock);
 
-	struct starpu_multiformat_data_interface_ops *mf_ops;
-	mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 	switch(node_kind)
 	{
 	case STARPU_CPU_RAM:
@@ -427,13 +425,21 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 			STARPU_ASSERT(0);
 #ifdef STARPU_USE_CUDA
 		case STARPU_CUDA_RAM:
+		{
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 			conversion_task->cl = mf_ops->cuda_to_cpu_cl;
 			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 		case STARPU_OPENCL_RAM:
+		{
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 			conversion_task->cl = mf_ops->opencl_to_cpu_cl;
 			break;
+		}
 #endif
 		default:
 			fprintf(stderr, "Oops : %u\n", handle->mf_node);
@@ -442,13 +448,21 @@ struct starpu_task *_starpu_create_conversion_task(starpu_data_handle_t handle,
 		break;
 #ifdef STARPU_USE_CUDA
 	case STARPU_CUDA_RAM:
-		conversion_task->cl = mf_ops->cpu_to_cuda_cl;
-		break;
+		{
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			conversion_task->cl = mf_ops->cpu_to_cuda_cl;
+			break;
+		}
 #endif
 #ifdef STARPU_USE_OPENCL
 	case STARPU_OPENCL_RAM:
+	{
+		struct starpu_multiformat_data_interface_ops *mf_ops;
+		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 		conversion_task->cl = mf_ops->cpu_to_opencl_cl;
 		break;
+	}
 #endif
 	case STARPU_SPU_LS: /* Not supported */
 	default:

+ 4 - 6
src/core/topology.c

@@ -838,17 +838,15 @@ static void _starpu_init_workers_binding(struct _starpu_machine_config *config)
 #endif /* __GLIBC__ */
 
 #ifdef STARPU_HAVE_HWLOC
-		/* Clear the cpu set and set the cpu */
-		workerarg->initial_hwloc_cpu_set = hwloc_bitmap_alloc();
-		hwloc_bitmap_only(workerarg->initial_hwloc_cpu_set, workerarg->bindid);
-		workerarg->current_hwloc_cpu_set = hwloc_bitmap_alloc();
-		hwloc_bitmap_only(workerarg->current_hwloc_cpu_set, workerarg->bindid);
-
 		/* Put the worker descriptor in the userdata field of the hwloc object describing the CPU */
 		hwloc_obj_t worker_obj;
 		worker_obj = hwloc_get_obj_by_depth(config->topology.hwtopology,
 					config->cpu_depth, workerarg->bindid);
 		worker_obj->userdata = &config->workers[worker];
+
+		/* Clear the cpu set and set the cpu */
+		workerarg->initial_hwloc_cpu_set = hwloc_bitmap_dup(worker_obj->cpuset);
+		workerarg->current_hwloc_cpu_set = hwloc_bitmap_dup(worker_obj->cpuset);
 #endif
 	}
 }

+ 182 - 2
src/datawizard/interfaces/block_filters.c

@@ -41,9 +41,189 @@ void starpu_block_filter_func_block(void *father_interface, void *child_interfac
 	block_child->nz = nz;
 	block_child->elemsize = elemsize;
 
-	if (block_father->ptr)
+	if (block_father->dev_handle)
 	{
-                block_child->ptr = block_father->ptr + offset;
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
+                block_child->ldy = block_father->ldy;
+                block_child->ldz = block_father->ldz;
+                block_child->dev_handle = block_father->dev_handle;
+                block_child->offset = block_father->offset + offset;
+	}
+}
+
+void starpu_block_shadow_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
+                                    unsigned id, unsigned nparts)
+{
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
+
+        uintptr_t shadow_size = (uintptr_t) f->filter_arg_ptr;
+
+	/* actual number of elements */
+	uint32_t nx = block_father->nx - 2 * shadow_size;
+        uint32_t ny = block_father->ny;
+        uint32_t nz = block_father->nz;
+	size_t elemsize = block_father->elemsize;
+
+	STARPU_ASSERT(nparts <= nx);
+
+	uint32_t chunk_size = (nx + nparts - 1)/nparts;
+	size_t offset = id*chunk_size*elemsize;
+
+        uint32_t child_nx = STARPU_MIN(chunk_size, nx - id*chunk_size) + 2 * shadow_size;
+
+	block_child->nx = child_nx;
+	block_child->ny = ny;
+	block_child->nz = nz;
+	block_child->elemsize = elemsize;
+
+	if (block_father->dev_handle)
+	{
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
+                block_child->ldy = block_father->ldy;
+                block_child->ldz = block_father->ldz;
+                block_child->dev_handle = block_father->dev_handle;
+                block_child->offset = block_father->offset + offset;
+	}
+}
+
+void starpu_vertical_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
+                                    unsigned id, unsigned nparts)
+{
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
+
+	uint32_t nx = block_father->nx;
+        uint32_t ny = block_father->ny;
+        uint32_t nz = block_father->nz;
+	size_t elemsize = block_father->elemsize;
+
+	STARPU_ASSERT(nparts <= ny);
+
+	uint32_t chunk_size = (ny + nparts - 1)/nparts;
+	size_t offset = id*chunk_size*block_father->ldy*elemsize;
+
+        uint32_t child_ny = STARPU_MIN(chunk_size, ny - id*chunk_size);
+
+	block_child->nx = nx;
+	block_child->ny = child_ny;
+	block_child->nz = nz;
+	block_child->elemsize = elemsize;
+
+	if (block_father->dev_handle)
+	{
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
+                block_child->ldy = block_father->ldy;
+                block_child->ldz = block_father->ldz;
+                block_child->dev_handle = block_father->dev_handle;
+                block_child->offset = block_father->offset + offset;
+	}
+}
+
+void starpu_vertical_block_shadow_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
+                                    unsigned id, unsigned nparts)
+{
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
+
+        uintptr_t shadow_size = (uintptr_t) f->filter_arg_ptr;
+
+	uint32_t nx = block_father->nx;
+	/* actual number of elements */
+        uint32_t ny = block_father->ny - 2 * shadow_size;
+        uint32_t nz = block_father->nz;
+	size_t elemsize = block_father->elemsize;
+
+	STARPU_ASSERT(nparts <= ny);
+
+	uint32_t chunk_size = (ny + nparts - 1)/nparts;
+	size_t offset = id*chunk_size*block_father->ldy*elemsize;
+
+        uint32_t child_ny = STARPU_MIN(chunk_size, ny - id*chunk_size) + 2 * shadow_size;
+
+	block_child->nx = nx;
+	block_child->ny = child_ny;
+	block_child->nz = nz;
+	block_child->elemsize = elemsize;
+
+	if (block_father->dev_handle)
+	{
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
+                block_child->ldy = block_father->ldy;
+                block_child->ldz = block_father->ldz;
+                block_child->dev_handle = block_father->dev_handle;
+                block_child->offset = block_father->offset + offset;
+	}
+}
+
+void starpu_depth_block_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
+                                    unsigned id, unsigned nparts)
+{
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
+
+	uint32_t nx = block_father->nx;
+        uint32_t ny = block_father->ny;
+        uint32_t nz = block_father->nz;
+	size_t elemsize = block_father->elemsize;
+
+	STARPU_ASSERT(nparts <= nz);
+
+	uint32_t chunk_size = (nz + nparts - 1)/nparts;
+	size_t offset = id*chunk_size*block_father->ldz*elemsize;
+
+        uint32_t child_nz = STARPU_MIN(chunk_size, nz - id*chunk_size);
+
+	block_child->nx = nx;
+	block_child->ny = ny;
+	block_child->nz = child_nz;
+	block_child->elemsize = elemsize;
+
+	if (block_father->dev_handle)
+	{
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
+                block_child->ldy = block_father->ldy;
+                block_child->ldz = block_father->ldz;
+                block_child->dev_handle = block_father->dev_handle;
+                block_child->offset = block_father->offset + offset;
+	}
+}
+
+void starpu_depth_block_shadow_filter_func_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f,
+                                    unsigned id, unsigned nparts)
+{
+        struct starpu_block_interface *block_father = (struct starpu_block_interface *) father_interface;
+        struct starpu_block_interface *block_child = (struct starpu_block_interface *) child_interface;
+
+        uintptr_t shadow_size = (uintptr_t) f->filter_arg_ptr;
+
+	uint32_t nx = block_father->nx;
+        uint32_t ny = block_father->ny;
+	/* actual number of elements */
+        uint32_t nz = block_father->nz - 2 * shadow_size;
+	size_t elemsize = block_father->elemsize;
+
+	STARPU_ASSERT(nparts <= nz);
+
+	uint32_t chunk_size = (nz + nparts - 1)/nparts;
+	size_t offset = id*chunk_size*block_father->ldz*elemsize;
+
+        uint32_t child_nz = STARPU_MIN(chunk_size, nz - id*chunk_size) + 2 * shadow_size;
+
+	block_child->nx = nx;
+	block_child->ny = ny;
+	block_child->nz = child_nz;
+	block_child->elemsize = elemsize;
+
+	if (block_father->dev_handle)
+	{
+		if (block_father->ptr)
+                	block_child->ptr = block_father->ptr + offset;
                 block_child->ldy = block_father->ldy;
                 block_child->ldz = block_father->ldz;
                 block_child->dev_handle = block_father->dev_handle;

+ 8 - 2
src/datawizard/interfaces/data_interface.c

@@ -510,19 +510,25 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 			struct starpu_codelet *cl = NULL;
 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
 
-			struct starpu_multiformat_data_interface_ops *mf_ops;
-			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 			switch (node_kind)
 			{
 #ifdef STARPU_USE_CUDA
 				case STARPU_CUDA_RAM:
+				{
+					struct starpu_multiformat_data_interface_ops *mf_ops;
+					mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 					cl = mf_ops->cuda_to_cpu_cl;
 					break;
+				}
 #endif
 #ifdef STARPU_USE_OPENCL
 				case STARPU_OPENCL_RAM:
+				{
+					struct starpu_multiformat_data_interface_ops *mf_ops;
+					mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
 					cl = mf_ops->opencl_to_cpu_cl;
 					break;
+				}
 #endif
 				case STARPU_CPU_RAM:      /* Impossible ! */
 				case STARPU_SPU_LS:       /* Not supported */

+ 74 - 1
src/datawizard/interfaces/matrix_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
@@ -56,6 +56,45 @@ void starpu_block_filter_func(void *father_interface, void *child_interface, STA
 	}
 }
 
+/*
+ * an example of a dummy partition function : blocks ...
+ */
+void starpu_block_shadow_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+	struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+	struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
+	uintptr_t shadow_size = (uintptr_t) f->filter_arg_ptr;
+
+	/* actual number of elements */
+	uint32_t nx = matrix_father->nx - 2 * shadow_size;
+	uint32_t ny = matrix_father->ny;
+	size_t elemsize = matrix_father->elemsize;
+
+	STARPU_ASSERT(nchunks <= nx);
+
+	size_t chunk_size = ((size_t)nx + nchunks - 1)/nchunks;
+	size_t offset = (size_t)id*chunk_size*elemsize;
+
+	uint32_t child_nx =
+	  STARPU_MIN(chunk_size, (size_t)nx - (size_t)id*chunk_size) + 2 * shadow_size;
+
+	/* update the child's interface */
+	matrix_child->nx = child_nx;
+	matrix_child->ny = ny;
+	matrix_child->elemsize = elemsize;
+
+	/* is the information on this node valid ? */
+	if (matrix_father->dev_handle)
+	{
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
+	}
+}
+
 void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
 {
         struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
@@ -86,3 +125,37 @@ void starpu_vertical_block_filter_func(void *father_interface, void *child_inter
 		matrix_child->offset = matrix_father->offset + offset;
 	}
 }
+
+void starpu_vertical_block_shadow_filter_func(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+        struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+        struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
+	uintptr_t shadow_size = (uintptr_t) f->filter_arg_ptr;
+
+	uint32_t nx = matrix_father->nx;
+	/* actual number of elements */
+	uint32_t ny = matrix_father->ny - 2 * shadow_size;
+	size_t elemsize = matrix_father->elemsize;
+
+	STARPU_ASSERT(nchunks <= ny);
+
+	size_t chunk_size = ((size_t)ny + nchunks - 1)/nchunks;
+	size_t child_ny =
+	  STARPU_MIN(chunk_size, (size_t)ny - (size_t)id*chunk_size) + 2 * shadow_size;
+
+	matrix_child->nx = nx;
+	matrix_child->ny = child_ny;
+	matrix_child->elemsize = elemsize;
+
+	/* is the information on this node valid ? */
+	if (matrix_father->dev_handle)
+	{
+		size_t offset = (size_t)id*chunk_size*matrix_father->ld*elemsize;
+		if (matrix_father->ptr)
+			matrix_child->ptr = matrix_father->ptr + offset;
+		matrix_child->ld = matrix_father->ld;
+		matrix_child->dev_handle = matrix_father->dev_handle;
+		matrix_child->offset = matrix_father->offset + offset;
+	}
+}

+ 1 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -513,7 +513,7 @@ static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUT
 	cures = cudaMemcpy3DPeerAsync(&p, stream);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
-	cudaThreadSynchronize();
+	cudaStreamSynchronize(stream);
 
 	if (is_async)
 	{

+ 3 - 0
src/drivers/cpu/driver_cpu.c

@@ -127,6 +127,9 @@ int _starpu_cpu_driver_init(struct starpu_driver *d)
 	_starpu_bind_thread_on_cpu(cpu_worker->config, cpu_worker->bindid);
 
         _STARPU_DEBUG("cpu worker %d is ready on logical cpu %d\n", devid, cpu_worker->bindid);
+#ifdef STARPU_HAVE_HWLOC
+	_STARPU_DEBUG("cpu worker %d cpuset start at %d\n", devid, hwloc_bitmap_first(cpu_worker->initial_hwloc_cpu_set));
+#endif
 
 	_starpu_set_local_memory_node_key(&cpu_worker->memory_node);
 

+ 11 - 0
src/drivers/cuda/driver_cuda.c

@@ -28,7 +28,9 @@
 #include <drivers/driver_common/driver_common.h>
 #include "driver_cuda.h"
 #include <core/sched_policy.h>
+#ifdef HAVE_CUDA_GL_INTEROP_H
 #include <cuda_gl_interop.h>
+#endif
 
 /* the number of CUDA devices */
 static int ncudagpus;
@@ -115,13 +117,20 @@ void starpu_cuda_set_device(int devid)
 {
 	cudaError_t cures;
 	struct starpu_conf *conf = _starpu_get_machine_config()->conf;
+#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 	unsigned i;
+#endif
 
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	if (conf->n_cuda_opengl_interoperability) {
 		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
 		STARPU_ABORT();
 	}
+#elif !defined(HAVE_CUDA_GL_INTEROP_H)
+	if (conf->n_cuda_opengl_interoperability) {
+		fprintf(stderr,"OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled, please make sure that OpenGL headers were available before ./configure run.");
+		STARPU_ABORT();
+	}
 #else
 	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
 		if (conf->cuda_opengl_interoperability[i] == devid) {
@@ -132,7 +141,9 @@ void starpu_cuda_set_device(int devid)
 
 	cures = cudaSetDevice(devid);
 
+#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
 done:
+#endif
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 }

+ 1 - 1
src/sched_policies/eager_central_policy.c

@@ -55,7 +55,7 @@ static void eager_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned
 
 static void initialize_eager_center_policy(unsigned sched_ctx_id) 
 {
-	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: you are running the default eager scheduler, which does not include optimizations. Make sure to read the StarPU documentation about adding performance models in order to be able to use the heft or dmda schedulers instead.\n");
+	if (!getenv("STARPU_SILENT")) fprintf(stderr,"Warning: you are running the default eager scheduler, which is not very smart. Make sure to read the StarPU documentation about adding performance models in order to be able to use the heft or dmda schedulers instead.\n");
 
 	starpu_create_worker_collection_for_sched_ctx(sched_ctx_id, WORKER_LIST);
 

+ 2 - 4
src/util/starpu_cublas.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2011  Université de Bordeaux 1
- * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,9 +26,7 @@ static void init_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)
 	if (STARPU_UNLIKELY(cublasst))
 		STARPU_CUBLAS_REPORT_ERROR(cublasst);
 
-#if CUDA_VERSION >= 3010
 	cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
 }
 
 static void shutdown_cublas_func(void *args STARPU_ATTRIBUTE_UNUSED)

+ 1 - 2
tests/datawizard/acquire_cb.c

@@ -40,9 +40,8 @@ int main(int argc, char **argv)
 	starpu_data_unregister(token_handle);
 
         FPRINTF(stderr, "Token: %u\n", token);
-        STARPU_ASSERT(token == 42);
 
 	starpu_shutdown();
 
-	return EXIT_SUCCESS;
+	return (token == 42) ? EXIT_SUCCESS : EXIT_FAILURE;
 }

+ 5 - 1
tests/datawizard/handle_to_pointer.c

@@ -21,6 +21,9 @@
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #endif
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 #include <stdlib.h>
 #include "../helper.h"
 
@@ -51,8 +54,9 @@ static void cuda_task(void **buffers, void *args)
 
 	for(i = 0; i < size; i++)
 	{
-		cudaMemcpy(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice);
+		cudaMemcpyAsync(&numbers[i], &i, sizeof(int), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
 	}
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 12 - 12
tests/datawizard/increment_redux.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,14 +44,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	unsigned host_dst, host_src;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_dst += host_src;
 
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -62,8 +62,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -183,13 +183,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	unsigned host_token;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_token++;
 
-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 11 - 11
tests/datawizard/increment_redux_lazy.c

@@ -40,14 +40,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	unsigned host_dst, host_src;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_dst += host_src;
 
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -56,8 +56,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -165,13 +165,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	unsigned host_token;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_token++;
 
-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 12 - 12
tests/datawizard/increment_redux_v2.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,14 +43,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
 	unsigned host_dst, host_src;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_dst += host_src;
 
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 
 static void neutral_cuda_kernel(void *descr[], void *arg)
@@ -61,8 +61,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
 
 	/* This is a dummy technique of course */
 	unsigned host_dst = 0;
-	cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 
@@ -182,13 +182,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	unsigned host_token;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_token++;
 
-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 3 - 2
tests/datawizard/lazy_allocation.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2012       inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -134,7 +134,8 @@ static void cuda_check_content_codelet(void *descr[], __attribute__ ((unused)) v
 	for (i = 0; i < length; i++)
 	{
 		char dst;
-		cudaMemcpy(&dst, &buf[i], sizeof(char), cudaMemcpyDeviceToHost);
+		cudaMemcpyAsync(&dst, &buf[i], sizeof(char), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 		if (dst != 42)
 		{
 			FPRINTF(stderr, "buf[%u] is %c while it should be %c\n", i, dst, 42);

+ 6 - 3
tests/datawizard/manual_reduction.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -21,6 +21,7 @@
 
 #ifdef STARPU_USE_CUDA
 #include <cuda.h>
+#include <starpu_cuda.h>
 #endif
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -125,9 +126,11 @@ static void cuda_func_incr(void *descr[], void *cl_arg __attribute__((unused)))
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	unsigned h_val;
-	cudaMemcpy(&h_val, val, sizeof(unsigned), cudaMemcpyDeviceToHost);
+	cudaMemcpyAsync(&h_val, val, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	h_val++;
-	cudaMemcpy(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice);
+	cudaMemcpyAsync(val, &h_val, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 7 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,6 +23,10 @@
 #include <stdlib.h>
 #include "../helper.h"
 
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
+
 #define VECTORSIZE	1024
 
 starpu_data_handle_t v_handle;
@@ -53,7 +57,8 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg
 
 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);
 
-	cudaMemset(buf, 42, 1);
+	cudaMemsetAsync(buf, 42, 1, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 4 - 4
tests/datawizard/wt_broadcast.c

@@ -60,13 +60,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	unsigned host_token;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_token++;
 
-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 4 - 4
tests/datawizard/wt_host.c

@@ -60,13 +60,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
 	unsigned host_token;
 
 	/* This is a dummy technique of course */
-	cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 	host_token++;
 
-	cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 1 - 1
tests/experiments/latency/cuda_latency.c

@@ -96,7 +96,7 @@ void recv_data(unsigned src, unsigned dst)
 #ifdef ASYNC
 	cures = cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice, stream[dst]);
 	STARPU_ASSERT(!cures);
-	cures = cudaThreadSynchronize();
+	cures = cudaStreamSynchronize(stream[dst]);
 	STARPU_ASSERT(!cures);
 #else
 	cures = cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyHostToDevice);

+ 7 - 6
tests/microbenchs/matrix_as_vector.c

@@ -20,6 +20,7 @@
 
 #ifdef STARPU_USE_CUDA
 #  include <cublas.h>
+#  include <starpu_cuda.h>
 #endif
 
 #define LOOPS 100
@@ -46,11 +47,10 @@ void vector_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 	int nx = STARPU_VECTOR_GET_NX(descr[0]);
 
 	float sum = cublasSasum(nx, matrix, 1);
-	cudaThreadSynchronize();
 	sum /= nx;
 
-	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -78,11 +78,10 @@ void matrix_cuda_func(void *descr[], void *cl_arg __attribute__((unused)))
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
 
 	float sum = cublasSasum(nx*ny, matrix, 1);
-	cudaThreadSynchronize();
 	sum /= nx*ny;
 
-	cudaMemcpy(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
+	cudaMemcpyAsync(matrix, &sum, sizeof(matrix[0]), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 #endif /* STARPU_USE_CUDA */
 }
 
@@ -195,6 +194,7 @@ int main(int argc, char **argv)
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	starpu_helper_cublas_init();
 
 	devices = starpu_cpu_worker_get_count();
 	if (devices)
@@ -219,6 +219,7 @@ int main(int argc, char **argv)
 
 error:
 	if (ret == -ENODEV) ret=STARPU_TEST_SKIPPED;
+	starpu_helper_cublas_shutdown();
 	starpu_shutdown();
 	STARPU_RETURN(ret);
 }

+ 6 - 3
tests/perfmodels/non_linear_regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -21,6 +21,9 @@
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #endif
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 #include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
@@ -31,8 +34,8 @@ static void memset_cuda(void *descr[], void *arg)
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemset(ptr, 42, n * sizeof(*ptr));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 5 - 2
tests/perfmodels/regression_based.c

@@ -21,6 +21,9 @@
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #endif
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 #include "../helper.h"
 
 #ifdef STARPU_USE_CUDA
@@ -31,8 +34,8 @@ static void memset_cuda(void *descr[], void *arg)
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
 
-	cudaMemset(ptr, 42, n * sizeof(*ptr));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
 

+ 3 - 3
tests/perfmodels/valid_model.c

@@ -71,7 +71,7 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
 	codelet->model = model;
 
 	old_nsamples = 0;
-	ret = starpu_load_history_debug(codelet->model->symbol, &lmodel);
+	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
 	if (ret != 1)
 		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
 			old_nsamples += lmodel.per_arch[archid][0].regression.nsample;
@@ -86,10 +86,10 @@ static int submit(struct starpu_codelet *codelet, struct starpu_perfmodel *model
         starpu_data_unregister(handle);
 	starpu_shutdown(); // To force dumping perf models on disk
 
-	ret = starpu_load_history_debug(codelet->model->symbol, &lmodel);
+	ret = starpu_perfmodel_load_symbol(codelet->model->symbol, &lmodel);
 	if (ret == 1)
 	{
-		FPRINTF(stderr, "The performance model could not be loaded\n");
+		FPRINTF(stderr, "The performance model for the symbol <%s> could not be loaded\n", codelet->model->symbol);
 		return 1;
 	}
 

+ 8 - 0
tools/dev/rename.sed

@@ -14,6 +14,14 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
+s/\bstruct starpu_per_arch_perfmodel\b/struct starpu_perfmodel_per_arch/g
+s/\bstruct starpu_regression_model\b/struct starpu_perfmodel_regression_model/g
+s/\bstruct starpu_history_table\b/struct starpu_perfmodel_history_table/g
+s/\bstruct starpu_history_entry\b/struct starpu_perfmodel_history_entry/g
+s/\bstruct starpu_history_list\b/struct starpu_perfmodel_history_list/g
+s/\bstarpu_list_models\b/starpu_perfmodel_list/g
+s/\bstruct starpu_model_list\b/struct _starpu_perfmodel_list/g
+s/\bstarpu_load_history_debug\b/starpu_perfmodel_load_symbol/g
 s/\bstarpu_access_mode\b/enum starpu_access_mode/g
 s/\bstruct starpu_codelet_t\b/struct starpu_codelet/g
 s/\bstarpu_codelet\b/struct starpu_codelet/g

+ 2 - 2
tools/dev/rename.sh

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2010  Université de Bordeaux 1
-# Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -14,4 +14,4 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-find . -type f -not -path "*svn*"|xargs sed -i -f $(dirname $0)/rename.sed
+find . -type f -not -name rename.sed -not -path "*svn*"|xargs sed -i -f $(dirname $0)/rename.sed

+ 5 - 1
tools/starpu_calibrate_bus.c

@@ -71,6 +71,7 @@ static void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
+	int ret;
 	struct starpu_conf conf;
 
 	parse_args(argc, argv);
@@ -78,7 +79,10 @@ int main(int argc, char **argv)
 	starpu_conf_init(&conf);
 	conf.bus_calibrate = 1;
 
-	starpu_init(&conf);
+	ret = starpu_init(&conf);
+	if (ret == -ENODEV) return 77;
+	if (ret != 0) return ret;
+
 	starpu_shutdown();
 
 	return 0;

+ 6 - 1
tools/starpu_machine_display.c

@@ -118,6 +118,7 @@ PROGNAME);
 
 int main(int argc, char **argv)
 {
+	int ret;
 	int force = 0;
 	struct starpu_conf conf;
 
@@ -129,7 +130,11 @@ int main(int argc, char **argv)
 
 	/* Even if starpu_init returns -ENODEV, we should go on : we will just
 	 * print that we found no device. */
-	(void) starpu_init(&conf);
+	ret = starpu_init(&conf);
+	if (ret != 0 && ret != -ENODEV)
+	{
+		return ret;
+	}
 
 	unsigned ncpu = starpu_cpu_worker_get_count();
 	unsigned ncuda = starpu_cuda_worker_get_count();

+ 23 - 248
tools/starpu_perfmodel_display.c

@@ -32,16 +32,16 @@
 #define PROGNAME "starpu_perfmodel_display"
 
 /* display all available models */
-static int list = 0;
+static int plist = 0;
 /* what kernel ? */
-static char *symbol = NULL;
+static char *psymbol = NULL;
 /* what parameter should be displayed ? (NULL = all) */
-static char *parameter = NULL;
+static char *pparameter = NULL;
 /* which architecture ? (NULL = all)*/
-static char *arch = NULL;
+static char *parch = NULL;
 /* should we display a specific footprint ? */
-unsigned display_specific_footprint;
-uint32_t specific_footprint;
+static unsigned pdisplay_specific_footprint;
+static uint32_t pspecific_footprint;
 
 static void usage(char **argv)
 {
@@ -85,28 +85,28 @@ static void parse_args(int argc, char **argv)
 		{
                 case 'l':
                         /* list all models */
-                        list = 1;
+                        plist = 1;
                         break;
 
 		case 's':
 			/* symbol */
-			symbol = optarg;
+			psymbol = optarg;
 			break;
 
 		case 'p':
 			/* parameter (eg. a, b, c, mean, stddev) */
-			parameter = optarg;
+			pparameter = optarg;
 			break;
 
 		case 'a':
 			/* architecture (cpu, cuda, gordon) */
-			arch = optarg;
+			parch = optarg;
 			break;
 
 		case 'f':
 			/* footprint */
-			display_specific_footprint = 1;
-			sscanf(optarg, "%08x", &specific_footprint);
+			pdisplay_specific_footprint = 1;
+			sscanf(optarg, "%08x", &pspecific_footprint);
 			break;
 
 		case 'h':
@@ -125,7 +125,7 @@ static void parse_args(int argc, char **argv)
 		}
 	}
 
-	if (!symbol && !list)
+	if (!psymbol && !plist)
 	{
 		fprintf(stderr, "Incorrect usage, aborting\n");
                 usage(argv);
@@ -133,236 +133,6 @@ static void parse_args(int argc, char **argv)
 	}
 }
 
-static void display_history_based_perf_model(struct starpu_per_arch_perfmodel *per_arch_model)
-{
-	struct starpu_history_list *ptr;
-
-	ptr = per_arch_model->list;
-
-	if (!parameter && ptr)
-		fprintf(stderr, "# hash\t\tsize\t\tmean\t\tdev\t\tn\n");
-
-	while (ptr)
-	{
-		struct starpu_history_entry *entry = ptr->entry;
-		if (!display_specific_footprint || (entry->footprint == specific_footprint))
-		{
-			if (!parameter)
-			{
-				/* There isn't a parameter that is explicitely requested, so we display all parameters */
-				printf("%08x\t%-15lu\t%-15le\t%-15le\t%u\n", entry->footprint,
-					(unsigned long) entry->size, entry->mean, entry->deviation, entry->nsample);
-			}
-			else
-			{
-				/* only display the parameter that was specifically requested */
-				if (strcmp(parameter, "mean") == 0)
-				{
-					printf("%-15le\n", entry->mean);
-				}
-
-				if (strcmp(parameter, "stddev") == 0)
-				{
-					printf("%-15le\n", entry->deviation);
-					return;
-				}
-			}
-		}
-
-		ptr = ptr->next;
-	}
-}
-
-static void display_perf_model(struct starpu_perfmodel *model, enum starpu_perf_archtype arch, unsigned nimpl)
-{
-	struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][nimpl];
-	char archname[32];
-
-	if (arch_model->regression.nsample || arch_model->regression.valid || arch_model->regression.nl_valid || arch_model->list)
-	{
-		starpu_perfmodel_get_arch_name(arch, archname, 32, nimpl);
-		fprintf(stderr, "performance model for %s\n", archname);
-	}
-
-	if (parameter == NULL)
-	{
-		/* no specific parameter was requested, so we display everything */
-		if (arch_model->regression.nsample)
-		{
-			fprintf(stderr, "\tRegression : #sample = %d\n", arch_model->regression.nsample);
-		}
-
-		/* Only display the regression model if we could actually build a model */
-		if (arch_model->regression.valid)
-		{
-			fprintf(stderr, "\tLinear: y = alpha size ^ beta\n");
-			fprintf(stderr, "\t\talpha = %e\n", arch_model->regression.alpha);
-			fprintf(stderr, "\t\tbeta = %e\n", arch_model->regression.beta);
-		}
-		else
-		{
-			//fprintf(stderr, "\tLinear model is INVALID\n");
-		}
-
-		if (arch_model->regression.nl_valid)
-		{
-			fprintf(stderr, "\tNon-Linear: y = a size ^b + c\n");
-			fprintf(stderr, "\t\ta = %e\n", arch_model->regression.a);
-			fprintf(stderr, "\t\tb = %e\n", arch_model->regression.b);
-			fprintf(stderr, "\t\tc = %e\n", arch_model->regression.c);
-		}
-		else
-		{
-			//fprintf(stderr, "\tNon-Linear model is INVALID\n");
-		}
-
-		display_history_based_perf_model(arch_model);
-
-#if 0
-		char debugname[1024];
-		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
-		printf("\t debug file path : %s\n", debugname);
-#endif
-	}
-	else
-	{
-		/* only display the parameter that was specifically requested */
-		if (strcmp(parameter, "a") == 0)
-		{
-			printf("%e\n", arch_model->regression.a);
-			return;
-		}
-
-		if (strcmp(parameter, "b") == 0)
-		{
-			printf("%e\n", arch_model->regression.b);
-			return;
-		}
-
-		if (strcmp(parameter, "c") == 0)
-		{
-			printf("%e\n", arch_model->regression.c);
-			return;
-		}
-
-		if (strcmp(parameter, "alpha") == 0)
-		{
-			printf("%e\n", arch_model->regression.alpha);
-			return;
-		}
-
-		if (strcmp(parameter, "beta") == 0)
-		{
-			printf("%e\n", arch_model->regression.beta);
-			return;
-		}
-
-		if (strcmp(parameter, "path-file-debug") == 0)
-		{
-			char debugname[256];
-			starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
-			printf("%s\n", debugname);
-			return;
-		}
-
-		if ((strcmp(parameter, "mean") == 0) || (strcmp(parameter, "stddev")))
-		{
-			display_history_based_perf_model(arch_model);
-			return;
-		}
-
-		/* TODO display if it's valid ? */
-
-		fprintf(stderr, "Unknown parameter requested, aborting.\n");
-		exit(-1);
-	}
-}
-
-static void display_all_perf_models(struct starpu_perfmodel *model)
-{
-	if (arch == NULL)
-	{
-		/* display all architectures */
-		unsigned archid;
-		unsigned implid;
-		for (archid = 0; archid < STARPU_NARCH_VARIATIONS; archid++)
-		{
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-			{ /* Display all codelets on each arch */
-				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
-			}
-		}
-	}
-	else
-	{
-		if (strcmp(arch, "cpu") == 0)
-		{
-			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				display_perf_model(model, STARPU_CPU_DEFAULT,implid); /* Display all codelets on cpu */
-			return;
-		}
-
-		int k;
-		if (sscanf(arch, "cpu:%d", &k) == 1)
-		{
-			/* For combined CPU workers */
-			if ((k < 1) || (k > STARPU_MAXCPUS))
-			{
-				fprintf(stderr, "Invalid CPU size\n");
-				exit(-1);
-			}
-
-			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				display_perf_model(model, (enum starpu_perf_archtype) (STARPU_CPU_DEFAULT + k - 1), implid);
-			return;
-		}
-
-		if (strcmp(arch, "cuda") == 0)
-		{
-			unsigned archid;
-			unsigned implid;
-			for (archid = STARPU_CUDA_DEFAULT; archid < STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS; archid++)
-			{
-				for (implid = 0; implid <STARPU_MAXIMPLEMENTATIONS; implid ++)
-				{
-					char archname[32];
-					starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) archid, archname, 32, implid);
-					fprintf(stderr, "performance model for %s\n", archname);
-					display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
-				}
-			}
-			return;
-		}
-
-		/* There must be a cleaner way ! */
-		int gpuid;
-		int nmatched;
-		nmatched = sscanf(arch, "cuda_%d", &gpuid);
-		if (nmatched == 1)
-		{
-			unsigned archid = STARPU_CUDA_DEFAULT+ gpuid;
-			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				display_perf_model(model, (enum starpu_perf_archtype) archid, implid);
-			return;
-		}
-
-		if (strcmp(arch, "gordon") == 0)
-		{
-			fprintf(stderr, "performance model for gordon\n");
-			unsigned implid;
-			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++)
-				display_perf_model(model, STARPU_GORDON_DEFAULT, implid);
-			return;
-		}
-
-		fprintf(stderr, "Unknown architecture requested, aborting.\n");
-		exit(-1);
-	}
-}
-
 int main(int argc, char **argv)
 {
 #ifdef __MINGW32__
@@ -372,9 +142,9 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv);
 
-        if (list)
+        if (plist)
 	{
-                int ret = starpu_list_models(stdout);
+                int ret = starpu_perfmodel_list(stdout);
                 if (ret)
 		{
                         fprintf(stderr, "The performance model directory is invalid\n");
@@ -384,13 +154,18 @@ int main(int argc, char **argv)
         else
 	{
 		struct starpu_perfmodel model;
-                int ret = starpu_load_history_debug(symbol, &model);
+                int ret = starpu_perfmodel_load_symbol(psymbol, &model);
                 if (ret == 1)
 		{
-			fprintf(stderr, "The performance model could not be loaded\n");
+			fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", psymbol);
 			return 1;
 		}
-                display_all_perf_models(&model);
+		uint32_t *footprint = NULL;
+		if (pdisplay_specific_footprint == 1)
+		{
+			footprint = &pspecific_footprint;
+		}
+		starpu_perfmodel_print_all(&model, parch, pparameter, footprint, stdout);
         }
 
 	return 0;

+ 9 - 9
tools/starpu_perfmodel_plot.c

@@ -174,7 +174,7 @@ static void display_perf_model(FILE *gnuplot_file, struct starpu_perfmodel *mode
 	char arch_name[256];
 	starpu_perfmodel_get_arch_name(arch, arch_name, 256, nimpl);
 
-	struct starpu_per_arch_perfmodel *arch_model =
+	struct starpu_perfmodel_per_arch *arch_model =
 		&model->per_arch[arch][nimpl];
 
 	if (arch_model->regression.valid || arch_model->regression.nl_valid)
@@ -220,7 +220,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 	char *command;
 	FILE *datafile;
 	unsigned arch;
-	struct starpu_history_list *ptr;
+	struct starpu_perfmodel_history_list *ptr;
 	char archname[32];
 	int col;
 	int len;
@@ -235,7 +235,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 	unsigned implid;
 	for (arch = arch1; arch < arch2; arch++) {
 		for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
-			struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
+			struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
 			starpu_perfmodel_get_arch_name((enum starpu_perf_archtype) arch, archname, 32, implid);
 
 			//ptrs[arch-arch1][implid] = ptr[arch-arch1][implid] = arch_model->list;
@@ -255,7 +255,7 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 		/* Get the next minimum */
 		for (arch = arch1; arch < arch2; arch++)
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
-				struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
+				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
 				for (ptr = arch_model->list; ptr; ptr = ptr->next) {
 					unsigned long size = ptr->entry->size;
 					if (size > last && size < minimum)
@@ -269,9 +269,9 @@ static void display_history_based_perf_models(FILE *gnuplot_file, struct starpu_
 		fprintf(datafile, "%-15lu ", minimum);
 		for (arch = arch1; arch < arch2; arch++) {
 			for (implid = 0; implid < STARPU_MAXIMPLEMENTATIONS; implid++) {
-				struct starpu_per_arch_perfmodel *arch_model = &model->per_arch[arch][implid];
+				struct starpu_perfmodel_per_arch *arch_model = &model->per_arch[arch][implid];
 				for (ptr = arch_model->list; ptr; ptr = ptr->next) {
-					struct starpu_history_entry *entry = ptr->entry;
+					struct starpu_perfmodel_history_entry *entry = ptr->entry;
 					if (entry->size == minimum) {
 						fprintf(datafile, "\t%-15le\t%-15le", 0.001*entry->mean, 0.001*entry->deviation);
 						break;
@@ -414,7 +414,7 @@ int main(int argc, char **argv)
 	parse_args(argc, argv);
 
         if (list) {
-                int ret = starpu_list_models(stdout);
+                int ret = starpu_perfmodel_list(stdout);
                 if (ret) {
                         fprintf(stderr, "The performance model directory is invalid\n");
                         return 1;
@@ -423,10 +423,10 @@ int main(int argc, char **argv)
         }
 
 	/* Load the performance model associated to the symbol */
-	ret = starpu_load_history_debug(symbol, &model);
+	ret = starpu_perfmodel_load_symbol(symbol, &model);
 	if (ret == 1)
 	{
-		fprintf(stderr, "The performance model could not be loaded\n");
+		fprintf(stderr, "The performance model for the symbol <%s> could not be loaded\n", symbol);
 		return 1;
 	}