Corentin Salingue 12 vuotta sitten
vanhempi
commit
209fb7bf53
57 muutettua tiedostoa jossa 662 lisäystä ja 657 poistoa
  1. 33 9
      configure.ac
  2. 1 0
      doc/doxygen/Makefile.am
  3. 25 25
      doc/doxygen/chapters/advanced_examples.doxy
  4. 4 4
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  5. 4 4
      doc/doxygen/chapters/api/cuda_extensions.doxy
  6. 4 4
      doc/doxygen/chapters/api/data_interfaces.doxy
  7. 9 9
      doc/doxygen/chapters/api/data_partition.doxy
  8. 1 1
      doc/doxygen/chapters/api/explicit_dependencies.doxy
  9. 1 1
      doc/doxygen/chapters/api/insert_task.doxy
  10. 7 7
      doc/doxygen/chapters/api/lower_bound.doxy
  11. 30 30
      doc/doxygen/chapters/api/mpi.doxy
  12. 5 5
      doc/doxygen/chapters/api/opencl_extensions.doxy
  13. 1 1
      doc/doxygen/chapters/api/parallel_tasks.doxy
  14. 7 7
      doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy
  15. 2 2
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  16. 4 4
      doc/doxygen/chapters/api/scheduling_policy.doxy
  17. 7 7
      doc/doxygen/chapters/api/task_bundles.doxy
  18. 7 7
      doc/doxygen/chapters/api/task_lists.doxy
  19. 44 44
      doc/doxygen/chapters/api/top.doxy
  20. 1 1
      doc/doxygen/chapters/api/workers.doxy
  21. 30 133
      doc/doxygen/chapters/basic_examples.doxy
  22. 8 8
      doc/doxygen/chapters/building.doxy
  23. 45 0
      doc/doxygen/chapters/code/scal_pragma.cu
  24. 24 18
      doc/doxygen/chapters/code/vector_scal_opencl.c
  25. 16 0
      doc/doxygen/chapters/configure_options.doxy
  26. 0 25
      doc/doxygen/chapters/environment_variables.doxy
  27. 3 3
      doc/doxygen/chapters/fft_support.doxy
  28. 10 10
      doc/doxygen/chapters/mic_scc_support.doxy
  29. 2 2
      doc/doxygen/chapters/mpi_support.doxy
  30. 12 12
      doc/doxygen/chapters/optimize_performance.doxy
  31. 15 15
      doc/doxygen/chapters/performance_feedback.doxy
  32. 12 0
      doc/doxygen/dev/checkDoc.sh
  33. 38 0
      doc/doxygen/dev/starpu_check_documented.py
  34. 78 0
      doc/doxygen/dev/starpu_check_undocumented.sh
  35. 0 0
      doc/doxygen/dev/starpu_funcs.cocci
  36. 1 1
      doc/doxygen/refman.tex
  37. 0 0
      doc/texinfo/dev/starpu_check_documented.py
  38. 0 0
      doc/texinfo/dev/starpu_check_undocumented.sh
  39. 28 0
      doc/texinfo/dev/starpu_funcs.cocci
  40. 2 2
      examples/spmv/matrix_market/mmio.c
  41. 5 1
      examples/stencil/life_opencl.c
  42. 3 3
      include/starpu_opencl.h
  43. 4 4
      include/starpu_sched_ctx.h
  44. 1 1
      include/starpu_task.h
  45. 1 1
      include/starpu_task_util.h
  46. 11 39
      include/starpu_top.h
  47. 28 33
      mic-configure
  48. 2 2
      mpi/starpumpi-1.1.pc.in
  49. 2 2
      mpi/starpumpi-1.2.pc.in
  50. 0 5
      socl/Makefile.am
  51. 0 2
      socl/README
  52. 0 29
      socl/socl-1.0.pc.in
  53. 0 29
      socl/socl-1.1.pc.in
  54. 0 29
      socl/socl-1.2.pc.in
  55. 0 1
      socl/vendors/socl.icd.in
  56. 82 74
      src/datawizard/memalloc.c
  57. 2 1
      src/sched_policies/parallel_eager.c

+ 33 - 9
configure.ac

@@ -785,14 +785,40 @@ AC_DEFUN([STARPU_LOOK_FOR_OPENCL],
 ])
 
 if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
-	STARPU_LOOK_FOR_OPENCL()
-	# in case OpenCL was explicitely required, but is not available, this is an error
-	if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
-	    AC_MSG_ERROR([cannot find OpenCL])
-	fi
+   case $target in
+        *-*-darwin*)
+          AC_MSG_CHECKING(whether OpenCL is available)
 
-	# now we enable OpenCL if and only if a proper setup is available
-	enable_opencl=$have_valid_opencl
+          SAVED_LIBS=$LIBS
+          LIBS="$LIBS -framework OpenCL"
+          AC_LINK_IFELSE(
+          [AC_LANG_PROGRAM([[
+          #ifdef __APPLE_CC__
+          #include <OpenCL/opencl.h>
+          #else
+          #include <CL/cl.h>
+          #endif
+          ]],
+            [[return clSetKernelArg(0, 0, 0, 0); ]])],
+          [AC_MSG_RESULT(yes)
+            enable_opencl=yes
+            have_valid_opencl=yes
+            STARPU_OPENCL_CPPFLAGS=
+            STARPU_OPENCL_LDFLAGS="-framework OpenCL"],
+          [AC_MSG_RESULT(no)
+             enable_opencl=no])
+          LIBS=$SAVED_LIBS
+          ;;        
+        *)
+	  STARPU_LOOK_FOR_OPENCL()
+	  # in case OpenCL was explicitly required, but is not available, this is an error
+	  if test x$enable_opencl = xyes -a x$have_valid_opencl = xno; then
+	    AC_MSG_ERROR([cannot find OpenCL])
+	  fi
+	  # now we enable OpenCL if and only if a proper setup is available
+	  enable_opencl=$have_valid_opencl
+          ;;
+   esac
 fi
 
 AC_MSG_CHECKING(whether OpenCL should be used)
@@ -2203,8 +2229,6 @@ AC_OUTPUT([
 	socl/Makefile
 	socl/src/Makefile
 	socl/examples/Makefile
-        socl/socl-1.0.pc
-	socl/socl-1.1.pc
 	socl/vendors/socl.icd
 	libstarpu.pc
 	starpu-1.0.pc

+ 1 - 0
doc/doxygen/Makefile.am

@@ -44,6 +44,7 @@ chapters =	\
 	chapters/mic_scc_support.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
+	chapters/code/scal_pragma.cu \
 	chapters/code/matmul_pragma.c \
 	chapters/code/matmul_pragma2.c \
 	chapters/code/cholesky_pragma.c \

+ 25 - 25
doc/doxygen/chapters/advanced_examples.doxy

@@ -92,12 +92,12 @@ thus be very fast. The function starpu_cuda_get_device_properties()
 provides a quick access to CUDA properties of CUDA devices to achieve
 such efficiency.
 
-Another example is compiling CUDA code for various compute capabilities,
+Another example is to compile CUDA code for various compute capabilities,
 resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
 1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
-provided to StarPU by using <c>cuda_funcs</c>, and <c>can_execute</c> can then be
-used to rule out the <c>scal_gpu_20</c> variant on a CUDA device which
-will not be able to execute it:
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
 
 \code{.c}
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@@ -390,9 +390,9 @@ starpu_perfmodel::size_base however permits the application to
 override that, when for instance some of the data do not matter for
 task cost (e.g. mere reference table), or when using sparse
 structures (in which case it is the number of non-zeros which matter), or when
-there is some hidden parameter such as the number of iterations, etc. The
-<c>examples/pi</c> examples uses this to include the number of iterations in the
-base.
+there is some hidden parameter such as the number of iterations, etc.
+The example in the directory <c>examples/pi</c> uses this to include
+the number of iterations in the base.
 
 How to use schedulers which can benefit from such performance model is explained
 in \ref TaskSchedulingPolicy.
@@ -427,11 +427,11 @@ starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
 to output a Linear Programming problem corresponding to the schedule
 of your tasks. Run it through <c>lp_solve</c> or any other linear
 programming solver, and that will give you a lower bound for the total
-execution time of your tasks. If StarPU was compiled with the glpk
-library installed, starpu_bound_compute() can be used to solve it
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
 immediately and get the optimized minimum, in ms. Its parameter
 <c>integer</c> allows to decide whether integer resolution should be
-computed and returned too.
+computed and returned too.
 
 The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
 data, and tag dependencies into account. Tags released in a callback
@@ -549,7 +549,7 @@ STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
 The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
 acquiring data <c>i</c> for the main application, and will execute the code
 given as third parameter when it is acquired. In other words, as soon as the
-value of <c>i</c> computed by the <c>which_index</c> codelet can be read, the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
 portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
 be executed, and is allowed to read from <c>i</c> to use it e.g. as an
 index. Note that this macro is only available when compiling StarPU with
@@ -609,7 +609,7 @@ struct starpu_codelet accumulate_variable_cl =
 }
 \endcode
 
-and attaches them as reduction methods for its <c>dtq</c> handle:
+and attaches them as reduction methods for its handle <c>dtq</c>:
 
 \code{.c}
 starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
@@ -674,7 +674,7 @@ tasks.
 Data can sometimes be entirely produced by a task, and entirely consumed by
 another task, without the need for other parts of the application to access
 it. In such case, registration can be done without prior allocation, by using
-the special -1 memory node number, and passing a zero pointer. StarPU will
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
 actually allocate memory only when the task creating the content gets scheduled,
 and destroy it on unregistration.
 
@@ -704,9 +704,8 @@ function, and free it at the end, but that would be costly. It could also
 allocate one buffer per worker (similarly to \ref
 HowToInitializeAComputationLibraryOnceForEachWorker), but that would
 make them systematic and permanent. A more  optimized way is to use
-the ::STARPU_SCRATCH data access mode, as examplified below,
-
-which provides per-worker buffers without content consistency.
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
 
 \code{.c}
 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
@@ -723,7 +722,7 @@ the other on the same worker. Also, if for instance GPU memory becomes scarce,
 StarPU will notice that it can free such buffers easily, since the content does
 not matter.
 
-The <c>examples/pi</c> example uses scratches for some temporary buffer.
+The example <c>examples/pi</c> uses scratches for some temporary buffer.
 
 \section ParallelTasks Parallel Tasks
 
@@ -734,8 +733,9 @@ parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
 work collectively on a single task, the completion time of tasks on CPUs become
 comparable to the completion time on GPUs, thus relieving from granularity
-discrepancy concerns. Hwloc support needs to be enabled to get good performance,
-otherwise StarPU will not know how to better group cores.
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how to better group
+cores.
 
 Two modes of execution exist to accomodate with existing usages.
 
@@ -808,8 +808,8 @@ buffer.
 
 To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
 be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
-::STARPU_SPMD, the <c>pheft</c> (parallel-heft) and <c>peager</c>
-(parallel eager) schedulers will indeed also try to execute tasks with
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
 several CPUs. It will automatically try the various available combined
 worker sizes (making several measurements for each worker size) and
 thus be able to avoid choosing a large combined worker if the codelet
@@ -846,9 +846,9 @@ from different threads, due to the use of global variables in their sequential
 sections for instance.
 
 The solution is then to use only one combined worker at a time.  This can be
-done by setting the field starpu_conf::single_combined_worker to 1, or
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
 setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
-to 1. StarPU will then run only one parallel task at a time (but other
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
 CPU and GPU tasks are not affected and can be run concurrently). The parallel
 task scheduler will however still try varying combined worker
 sizes to look for the most efficient ones.
@@ -1183,8 +1183,8 @@ directory <c>examples/basic_examples/dynamic_handles.c</c>.
 
 \section MoreExamples More Examples
 
-More examples are available in the StarPU sources in the <c>examples/</c>
-directory. Simple examples include:
+More examples are available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
 
 <dl>
 <dt> <c>incrementer/</c> </dt>

+ 4 - 4
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -569,7 +569,7 @@ starpu_codelet::modes or the \p i th element of the field
 starpu_codelet::dyn_modes (see \ref
 SettingTheDataHandlesForATask)
 
-\fn struct starpu_task * starpu_task_create(void)
+\fn struct starpu_task *starpu_task_create(void)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure and initialize it with default
 values. Tasks allocated dynamically with starpu_task_create() are
@@ -580,7 +580,7 @@ wait) and thus freed at any time. If the field starpu_task::destroy is
 explicitly unset, the resources used by the task have to be freed by
 calling starpu_task_destroy().
 
-\fn struct starpu_task * starpu_task_dup(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_dup(struct starpu_task *task)
 \ingroup API_Codelet_And_Tasks
 Allocate a task structure which is the exact duplicate of the
 given task.
@@ -657,7 +657,7 @@ Return the number of submitted tasks which are ready for
 execution are already executing. It thus does not include tasks
 waiting for dependencies.
 
-\fn struct starpu_task * starpu_task_get_current(void)
+\fn struct starpu_task *starpu_task_get_current(void)
 \ingroup API_Codelet_And_Tasks
 This function returns the task currently executed by the
 worker, or <c>NULL</c> if it is called either from a thread that is not a
@@ -681,7 +681,7 @@ codelet implementation to be executed when executing the task.
 This function return the codelet implementation to be executed
 when executing the task.
 
-\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg)
+\fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 This creates (and submits) an empty task that unlocks a tag once all
 its dependencies are fulfilled.

+ 4 - 4
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -26,7 +26,7 @@ create its own streams. Synchronizing with cudaThreadSynchronize() is
 allowed, but will reduce the likelihood of having all transfers
 overlapped.
 
-\fn const struct cudaDeviceProp * starpu_cuda_get_device_properties(unsigned workerid)
+\fn const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
 \ingroup API_CUDA_Extensions
 This function returns a pointer to device properties for worker
 \p workerid (assumed to be a CUDA worker).
@@ -35,11 +35,11 @@ This function returns a pointer to device properties for worker
 \ingroup API_CUDA_Extensions
 Report a CUDA error.
 
-\def STARPU_CUDA_REPORT_ERROR (cudaError_t status)
+\def STARPU_CUDA_REPORT_ERROR(cudaError_t status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cuda_report_error(), passing the current function, file and line position.
 
-\fn int starpu_cuda_copy_async_sync (void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
+\fn int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind)
 \ingroup API_CUDA_Extensions
 Copy \p ssize bytes from the pointer \p src_ptr on \p src_node
 to the pointer \p dst_ptr on \p dst_node. The function first tries to
@@ -72,7 +72,7 @@ every CUDA device.
 \ingroup API_CUDA_Extensions
 Report a cublas error.
 
-\def STARPU_CUBLAS_REPORT_ERROR (cublasStatus status)
+\def STARPU_CUBLAS_REPORT_ERROR(cublasStatus status)
 \ingroup API_CUDA_Extensions
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.

+ 4 - 4
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -671,12 +671,12 @@ row pointers...) of the matrix desginated by \p handle.
 Return a pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a pointer to the column index, which holds the positions
 of the non-zero entries in the matrix designated by \p handle.
 
-\fn uint32_t * starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return the row pointer array of the matrix designated by
 \p handle.
@@ -780,12 +780,12 @@ row pointers...) of the matrix designated by \p handle.
 Return a local pointer to the non-zero values of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_colind(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_colind(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the column index of the matrix
 designated by \p handle.
 
-\fn uint32_t * starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
+\fn uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle_t handle)
 \ingroup API_Data_Interfaces
 Return a local pointer to the row pointer array of the matrix
 designated by \p handle.

+ 9 - 9
doc/doxygen/chapters/api/data_partition.doxy

@@ -71,7 +71,7 @@ This function returns the number of children.
 Return the ith child of the given \p handle, which must have been
 partitionned beforehand.
 
-\fn starpu_data_handle_t starpu_data_get_sub_data (starpu_data_handle_t root_data, unsigned depth, ... )
+\fn starpu_data_handle_t starpu_data_get_sub_data(starpu_data_handle_t root_data, unsigned depth, ... )
 \ingroup API_Data_Partition
 After partitioning a StarPU data by applying a filter,
 starpu_data_get_sub_data() can be used to get handles for each of the
@@ -192,13 +192,13 @@ functions for block data. Examples on how to use them are shown in
 <c>starpu_data_filters.h</c>. A usage example is available in
 examples/filters/shadow3d.c
 
-\fn void starpu_block_filter_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, thus getting
 (x/\p nparts ,y,z) 3D matrices. If \p nparts does not divide x, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the X dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -207,13 +207,13 @@ divide x, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, thus getting
 (x,y/\p nparts ,z) blocks. If \p nparts does not divide y, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_vertical_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Y dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -222,13 +222,13 @@ divide y, the last submatrix contains the remainder. <b>IMPORTANT</b>:
 This can only be used for read-only access, as no coherency is
 enforced for the shadowed parts.
 
-\fn void starpu_block_filter_depth_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, thus getting
 (x,y,z/\p nparts) blocks. If \p nparts does not divide z, the last
 submatrix contains the remainder.
 
-\fn void starpu_block_filter_depth_block_shadow (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block along the Z dimension, with a
 shadow border <c>filter_arg_ptr</c>, thus getting
@@ -245,11 +245,11 @@ functions for BCSR data. Examples on how to use them are shown in
 \ref PartitioningData. The complete list can be found in the file
 <c>starpu_data_filters.h</c>.
 
-\fn void starpu_bcsr_filter_canonical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into dense matrices.
 
-\fn void starpu_csr_filter_vertical_block (void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
+\fn void starpu_csr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
 \ingroup API_Data_Partition
 This partitions a block-sparse matrix into vertical
 block-sparse matrices.

+ 1 - 1
doc/doxygen/chapters/api/explicit_dependencies.doxy

@@ -99,7 +99,7 @@ This function releases the resources associated to tag \p id.
 It can be called once the corresponding task has been executed and
 when there is no other tag that depend on this tag anymore.
 
-\fn void starpu_tag_notify_from_apps (starpu_tag_t id)
+\fn void starpu_tag_notify_from_apps(starpu_tag_t id)
 \ingroup API_Explicit_Dependencies
 This function explicitly unlocks tag \p id. It may be useful in
 the case of applications which execute part of their computation

+ 1 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -90,7 +90,7 @@ Pack arguments of type ::STARPU_VALUE into a buffer which can be
 given to a codelet and later unpacked with the function
 starpu_codelet_unpack_args().
 
-\fn void starpu_codelet_unpack_args (void *cl_arg, ...)
+\fn void starpu_codelet_unpack_args(void *cl_arg, ...)
 \ingroup API_Insert_Task
 Retrieve the arguments of type ::STARPU_VALUE associated to a
 task automatically created using the function starpu_insert_task().

+ 7 - 7
doc/doxygen/chapters/api/lower_bound.doxy

@@ -11,36 +11,36 @@
 \brief Compute theoretical upper computation efficiency bound
 corresponding to some actual execution.
 
-\fn void starpu_bound_start (int deps, int prio)
+\fn void starpu_bound_start(int deps, int prio)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Start recording tasks (resets stats). \p deps tells whether
 dependencies should be recorded too (this is quite expensive)
 
-\fn void starpu_bound_stop (void)
+\fn void starpu_bound_stop(void)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Stop recording tasks
 
-\fn void starpu_bound_print_dot (FILE *output)
+\fn void starpu_bound_print_dot(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Print the DAG that was recorded
 
-\fn void starpu_bound_compute (double *res, double *integer_res, int integer)
+\fn void starpu_bound_compute(double *res, double *integer_res, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Get theoretical upper bound (in ms) (needs glpk support
 detected by configure script). It returns 0 if some performance models
 are not calibrated.
 
-\fn void starpu_bound_print_lp (FILE *output)
+\fn void starpu_bound_print_lp(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the lp format
 
-\fn void starpu_bound_print_mps (FILE *output)
+\fn void starpu_bound_print_mps(FILE *output)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit the Linear Programming system on \p output for the recorded
 tasks, in the mps format
 
-\fn void starpu_bound_print (FILE *output, int integer)
+\fn void starpu_bound_print(FILE *output, int integer)
 \ingroup API_Theoretical_Lower_Bound_on_Execution_Time
 Emit statistics of actual execution vs theoretical upper bound.
 \p integer permits to choose between integer solving (which takes a

+ 30 - 30
doc/doxygen/chapters/api/mpi.doxy

@@ -11,21 +11,21 @@
 @name Initialisation
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_init (int *argc, char ***argv, int initialize_mpi)
+\fn int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi)
 \ingroup API_MPI_Support
 Initializes the starpumpi library. \p initialize_mpi indicates if MPI
 should be initialized or not by StarPU. If the value is not 0, MPI
 will be initialized by calling <c>MPI_Init_Thread(argc, argv,
 MPI_THREAD_SERIALIZED, ...)</c>.
 
-\fn int starpu_mpi_initialize (void)
+\fn int starpu_mpi_initialize(void)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
 function starpu_mpi_init(). This function does not call MPI_Init(), it
 should be called beforehand.
 
-\fn int starpu_mpi_initialize_extended (int *rank, int *world_size)
+\fn int starpu_mpi_initialize_extended(int *rank, int *world_size)
 \deprecated
 \ingroup API_MPI_Support
 This function has been made deprecated. One should use instead the
@@ -33,13 +33,13 @@ function starpu_mpi_init(). MPI will be initialized by starpumpi by
 calling <c>MPI_Init_Thread(argc, argv, MPI_THREAD_SERIALIZED,
 ...)</c>.
 
-\fn int starpu_mpi_shutdown (void)
+\fn int starpu_mpi_shutdown(void)
 \ingroup API_MPI_Support
 Cleans the starpumpi library. This must be called between calling
 starpu_mpi functions and starpu_shutdown(). MPI_Finalize() will be
 called if StarPU-MPI has been initialized by starpu_mpi_init().
 
-\fn void starpu_mpi_comm_amounts_retrieve (size_t *comm_amounts)
+\fn void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts)
 \ingroup API_MPI_Support
 Retrieve the current amount of communications from the current node in
 the array \p comm_amounts which must have a size greater or equal to
@@ -50,33 +50,33 @@ the world size. Communications statistics must be enabled (see
 \anchor MPIPtpCommunication
 \ingroup API_MPI_Support
 
-\fn int starpu_mpi_send (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm.
 
-\fn int starpu_mpi_recv (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
+\fn int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, MPI_Status *status)
 \ingroup API_MPI_Support
 Performs a standard-mode, blocking receive in \p data_handle from the
 node \p source using the message tag \p mpi_tag within the
 communicator \p comm.
 
-\fn int starpu_mpi_isend (starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. After the call, the pointer to the request \p req can be used to
 test or to wait for the completion of the communication.
 
-\fn int starpu_mpi_irecv (starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
+\fn int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, int mpi_tag, MPI_Comm comm)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm.
 After the call, the pointer to the request \p req can be used to test
 or to wait for the completion of the communication.
 
-\fn int starpu_mpi_isend_detached (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
@@ -87,7 +87,7 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_irecv_detached (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
+\fn int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
@@ -98,34 +98,34 @@ communication completes, its resources are automatically released back
 to the system, there is no need to test or to wait for the completion
 of the request.
 
-\fn int starpu_mpi_wait (starpu_mpi_req *req, MPI_Status *status)
+\fn int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status)
 \ingroup API_MPI_Support
 Returns when the operation identified by request \p req is complete.
 
-\fn int starpu_mpi_test (starpu_mpi_req *req, int *flag, MPI_Status *status)
+\fn int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status)
 \ingroup API_MPI_Support
 If the operation identified by \p req is complete, set \p flag to 1.
 The \p status object is set to contain information on the completed
 operation.
 
-\fn int starpu_mpi_barrier (MPI_Comm comm)
+\fn int starpu_mpi_barrier(MPI_Comm comm)
 \ingroup API_MPI_Support
 Blocks the caller until all group members of the communicator \p comm
 have called it.
 
-\fn int starpu_mpi_isend_detached_unlock_tag (starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a standard-mode, non blocking send of \p data_handle to the node
 \p dest using the message tag \p mpi_tag within the communicator \p
 comm. On completion, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_detached_unlock_tag (starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, int mpi_tag, MPI_Comm comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts a nonblocking receive in \p data_handle from the node \p source
 using the message tag \p mpi_tag within the communicator \p comm. On
 completion, \p tag is unlocked.
 
-\fn int starpu_mpi_isend_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size standard-mode, non blocking send. Each post sends
 the n-th data of the array \p data_handle to the n-th node of the
@@ -133,7 +133,7 @@ array \p dest using the n-th message tag of the array \p mpi_tag
 within the n-th communicator of the array \p comm. On completion of
 the all the requests, \p tag is unlocked.
 
-\fn int starpu_mpi_irecv_array_detached_unlock_tag (unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
+\fn int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, int *mpi_tag, MPI_Comm *comm, starpu_tag_t tag)
 \ingroup API_MPI_Support
 Posts \p array_size non-blocking receives. Each post receives in the n-th
 data of the array \p data_handle from the n-th node of the array \p
@@ -144,14 +144,14 @@ requests, \p tag is unlocked.
 @name Communication Cache
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_cache_flush (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for the data
 \p data_handle. The function has to be called synchronously by all the
 MPI nodes. The function does nothing if the cache mechanism is
 disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_cache_flush_all_data (MPI_Comm comm)
+\fn void starpu_mpi_cache_flush_all_data(MPI_Comm comm)
 \ingroup API_MPI_Support
 Clear the send and receive communication cache for all data. The
 function has to be called synchronously by all the MPI nodes. The
@@ -162,21 +162,21 @@ function does nothing if the cache mechanism is disabled (see
 \anchor MPIInsertTask
 \ingroup API_MPI_Support
 
-\fn int starpu_data_set_tag (starpu_data_handle_t handle, int tag)
+\fn int starpu_data_set_tag(starpu_data_handle_t handle, int tag)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI tag to use when exchanging the data.
 
-\fn int starpu_data_get_tag (starpu_data_handle_t handle)
+\fn int starpu_data_get_tag(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the MPI tag to be used when exchanging the data.
 
-\fn int starpu_data_set_rank (starpu_data_handle_t handle, int rank)
+\fn int starpu_data_set_rank(starpu_data_handle_t handle, int rank)
 \ingroup API_MPI_Support
 Tell StarPU-MPI which MPI node "owns" a given data, that is, the node
 which will always keep an up-to-date value, and will by default
 execute tasks which write to it.
 
-\fn int starpu_data_get_rank (starpu_data_handle_t handle)
+\fn int starpu_data_get_rank(starpu_data_handle_t handle)
 \ingroup API_MPI_Support
 Returns the last value set by starpu_data_set_rank().
 
@@ -192,7 +192,7 @@ this macro is used when calling starpu_mpi_insert_task(), and must be
 followed by a data handle to specify that the node owning the given
 data will execute the codelet.
 
-\fn int starpu_mpi_insert_task (MPI_Comm comm, struct starpu_codelet *codelet, ...)
+\fn int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...)
 \ingroup API_MPI_Support
 Create and submit a task corresponding to codelet with the following
 arguments. The argument list must be zero-terminated.
@@ -230,13 +230,13 @@ The algorithm also includes a communication cache mechanism that
 allows not to send data twice to the same MPI node, unless the data
 has been modified. The cache can be disabled (see \ref STARPU_MPI_CACHE).
 
-\fn void starpu_mpi_get_data_on_node (MPI_Comm comm, starpu_data_handle_t data_handle, int node)
+\fn void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
 the function.
 
-\fn void starpu_mpi_get_data_on_node_detached (MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
+\fn void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 \ingroup API_MPI_Support
 Transfer data \p data_handle to MPI node \p node, sending it from its
 owner if needed. At least the target node and the owner have to call
@@ -247,12 +247,12 @@ the argument \p arg.
 \anchor MPICollectiveOperations
 \ingroup API_MPI_Support
 
-\fn void starpu_mpi_redux_data (MPI_Comm comm, starpu_data_handle_t data_handle)
+\fn void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 \ingroup API_MPI_Support
 Perform a reduction on the given data. All nodes send the data to its
 owner node which will perform a reduction.
 
-\fn int starpu_mpi_scatter_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Scatter data among processes of the communicator based on the
 ownership of the data. For each data of the array \p data_handles, the
@@ -263,7 +263,7 @@ called with the argument \p sarg on the process \p root, the \p
 rcallback function is called with the argument \p rarg on any other
 process.
 
-\fn int starpu_mpi_gather_detached (starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+\fn int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 \ingroup API_MPI_Support
 Gather data from the different processes of the communicator onto the
 process \p root. Each process owning data handle in the array

+ 5 - 5
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -82,11 +82,11 @@ starpu_opencl_program array by hand for more advanced use (e.g.
 different programs on the different OpenCL devices, for relocation
 purpose for instance).
 
-\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a file.
 
-\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options)
+\fn int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options)
 \ingroup API_OpenCL_Extensions
 This function compiles an OpenCL source code stored in a string.
 
@@ -107,7 +107,7 @@ has been located on the system, \p located_dir_name the directory
 where it has been located. Otherwise, they are both set to the empty
 string.
 
-\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char * build_options)
+\fn int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel stored in the file \p source_file_name
 with the given options \p build_options and stores the result in the
@@ -116,7 +116,7 @@ directory <c>$STARPU_HOME/.starpu/opencl</c> with the same filename as
 and the filename is suffixed with the vendor id and the device id of
 the OpenCL device.
 
-\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char*build_options)
+\fn int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options)
 \ingroup API_OpenCL_Extensions
 Compile the OpenCL kernel in the string \p opencl_program_source
 with the given options \p build_options and stores the result in the
@@ -158,7 +158,7 @@ consumed power).
 @name OpenCL utilities
 \ingroup API_OpenCL_Extensions
 
-\fn const char * starpu_opencl_error_string(cl_int status)
+\fn const char *starpu_opencl_error_string(cl_int status)
 \ingroup API_OpenCL_Extensions
 Return the error message in English corresponding to \p status, an OpenCL
 error code.

+ 1 - 1
doc/doxygen/chapters/api/parallel_tasks.doxy

@@ -42,7 +42,7 @@ Get the description of a combined worker
 Variant of starpu_worker_can_execute_task() compatible with combined
 workers
 
-\fn void starpu_parallel_task_barrier_init(struct starpu_task*task, int workerid)
+\fn void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid)
 \ingroup API_Parallel_Tasks
 Initialise the barrier for the parallel task, and dispatch the task
 between the different combined workers.

+ 7 - 7
doc/doxygen/chapters/api/scheduling_context_hypervisor.doxy

@@ -114,7 +114,7 @@ performance counters to StarPU. By incrementing them, StarPU can help
 the hypervisor in the resizing decision making process. TODO maybe
 they should be hidden to the user
 
-\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy * policy)
+\fn struct starpu_sched_ctx_performance_counters *sc_hypervisor_init(struct sc_hypervisor_policy *policy)
 \ingroup API_Scheduling_Context_Hypervisor
 Initializes the hypervisor to use the strategy provided as parameter
 and creates the performance counters (see starpu_sched_ctx_performance_counters).
@@ -148,7 +148,7 @@ flops the context will execute (needed for Gflops rate based strategy
 see \ref ResizingStrategies or any other custom strategy needing it, for
 the others we can pass 0.0)
 
-\fn void sc_hypervisor_unregister_ctx (unsigned sched_ctx)
+\fn void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 \ingroup API_Scheduling_Context_Hypervisor
 Unregister the context from the hypervisor.
 
@@ -268,11 +268,11 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Moves workers from one context to another
 
-\fn struct sc_hypervisor_policy_config * sc_hypervisor_get_config(unsigned sched_ctx);
+\fn struct sc_hypervisor_policy_config *sc_hypervisor_get_config(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the configuration structure of a context
 
-\fn int * sc_hypervisor_get_sched_ctxs();
+\fn int *sc_hypervisor_get_sched_ctxs();
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the contexts managed by the hypervisor
 
@@ -280,15 +280,15 @@ struct sc_hypervisor_policy dummy_policy =
 \ingroup API_Scheduling_Context_Hypervisor
     Gets the number of contexts managed by the hypervisor
 
-\fn struct sc_hypervisor_wrapper * sc_hypervisor_get_wrapper(unsigned sched_ctx);
+\fn struct sc_hypervisor_wrapper *sc_hypervisor_get_wrapper(unsigned sched_ctx);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the wrapper corresponding to the context \p sched_ctx
 
-\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper * sc_w);
+\fn double sc_hypervisor_get_elapsed_flops_per_sched_ctx(struct sc_hypervisor_wrapper *sc_w);
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the flops of a context elapsed from the last resize
 
-\fn char * sc_hypervisor_get_policy();
+\fn char *sc_hypervisor_get_policy();
 \ingroup API_Scheduling_Context_Hypervisor
     Returns the name of the resizing policy the hypervisor uses
 

+ 2 - 2
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -212,7 +212,7 @@ policy of the given scheduler context.
 @name Scheduling Context Worker Collection
 \ingroup API_Scheduling_Contexts
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
+\fn struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type)
 \ingroup API_Scheduling_Contexts
 Create a worker collection of the type indicated by the last parameter
 for the context specified through the first parameter.
@@ -221,7 +221,7 @@ for the context specified through the first parameter.
 \ingroup API_Scheduling_Contexts
 Delete the worker collection of the specified scheduling context
 
-\fn struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
+\fn struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Return the worker collection managed by the indicated context
 

+ 4 - 4
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -58,7 +58,7 @@ starpu_init().
 \var starpu_sched_policy::policy_description
         Optional field. Human readable description of the policy.
 
-\fn struct starpu_sched_policy ** starpu_sched_get_predefined_policies()
+\fn struct starpu_sched_policy **starpu_sched_get_predefined_policies()
 \ingroup API_Scheduling_Policy
 Return a NULL-terminated array of all the predefined scheduling
 policies.
@@ -73,13 +73,13 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
-\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void * policy_data)
+\fn void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data)
 \ingroup API_Scheduling_Policy
 Each scheduling policy uses some specific data (queues, variables,
 additional condition variables). This data is memorized in a local
 structure. This function assigns it to a scheduling context.
 
-\fn void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
+\fn void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 \ingroup API_Scheduling_Policy
 Returns the policy data previously assigned to a context
 
@@ -135,7 +135,7 @@ otherwise the task may fail to execute.
 \ingroup API_Scheduling_Policy
 Return the current date in micro-seconds.
 
-\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task * task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn uint32_t starpu_task_footprint(struct starpu_perfmodel *model, struct starpu_task *task, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Scheduling_Policy
 Returns the footprint for a given task
 

+ 7 - 7
doc/doxygen/chapters/api/task_bundles.doxy

@@ -15,12 +15,12 @@ on the same worker whenever it’s possible. It must be considered as a
 hint given to the scheduler as there is no guarantee that they will be
 executed on the same worker.
 
-\fn void starpu_task_bundle_create (starpu_task_bundle_t *bundle)
+\fn void starpu_task_bundle_create(starpu_task_bundle_t *bundle)
 \ingroup API_Task_Bundles
 Factory function creating and initializing \p bundle, when the call
 returns, memory needed is allocated and \p bundle is ready to use.
 
-\fn int starpu_task_bundle_insert (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_insert(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Insert \p task in \p bundle. Until \p task is removed from \p bundle
 its expected length and data transfer time will be considered along
@@ -30,7 +30,7 @@ On success, it returns 0. There are two cases of error : if \p bundle
 is already closed it returns <c>-EPERM</c>, if \p task was already
 submitted it returns <c>-EINVAL</c>.
 
-\fn int starpu_task_bundle_remove (starpu_task_bundle_t bundle, struct starpu_task *task)
+\fn int starpu_task_bundle_remove(starpu_task_bundle_t bundle, struct starpu_task *task)
 \ingroup API_Task_Bundles
 Remove \p task from \p bundle. Of course \p task must have been
 previously inserted in \p bundle. This function must not be called if
@@ -38,21 +38,21 @@ previously inserted in \p bundle. This function must not be called if
 so would result in undefined behaviour. On success, it returns 0. If
 \p bundle is already closed it returns <c>-ENOENT</c>.
 
-\fn void starpu_task_bundle_close (starpu_task_bundle_t bundle)
+\fn void starpu_task_bundle_close(starpu_task_bundle_t bundle)
 \ingroup API_Task_Bundles
 Inform the runtime that the user will not modify \p bundle anymore, it
 means no more inserting or removing task. Thus the runtime can destroy
 it when possible.
 
-\fn double starpu_task_bundle_expected_length (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_length(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected duration of \p bundle in micro-seconds.
 
-\fn double starpu_task_bundle_expected_power (starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
+\fn double starpu_task_bundle_expected_power(starpu_task_bundle_t bundle, enum starpu_perfmodel_archtype arch, unsigned nimpl)
 \ingroup API_Task_Bundles
 Return the expected power consumption of \p bundle in J.
 
-\fn double starpu_task_bundle_expected_data_transfer_time (starpu_task_bundle_t bundle, unsigned memory_node)
+\fn double starpu_task_bundle_expected_data_transfer_time(starpu_task_bundle_t bundle, unsigned memory_node)
 \ingroup API_Task_Bundles
 Return the time (in micro-seconds) expected to transfer all data used within \p bundle.
 

+ 7 - 7
doc/doxygen/chapters/api/task_lists.doxy

@@ -28,11 +28,11 @@ Push \p task at the front of \p list
 \ingroup API_Task_Lists
 Push \p task at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the front of \p list (without removing it)
 
-\fn struct starpu_task * starpu_task_list_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the back of \p list (without removing it)
 
@@ -44,23 +44,23 @@ Test if \p list is empty
 \ingroup API_Task_Lists
 Remove \p task from \p list
 
-\fn struct starpu_task * starpu_task_list_pop_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the front of \p list
 
-\fn struct starpu_task * starpu_task_list_pop_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Remove the element at the back of \p list
 
-\fn struct starpu_task * starpu_task_list_begin(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the first task of \p list.
 
-\fn struct starpu_task * starpu_task_list_end(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_end(struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the end of \p list.
 
-\fn struct starpu_task * starpu_task_list_next(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_list_next(struct starpu_task *task)
 \ingroup API_Task_Lists
 Get the next task of \p list. This is not erase-safe.
 

+ 44 - 44
doc/doxygen/chapters/api/top.doxy

@@ -9,62 +9,62 @@
 /*! \defgroup API_StarPUTop_Interface StarPU-Top Interface
 
 \enum starpu_top_data_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Data type
 \var starpu_top_data_type::STARPU_TOP_DATA_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_data_type::STARPU_TOP_DATA_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_param_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Parameter type
 \var starpu_top_param_type::STARPU_TOP_PARAM_BOOLEAN
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_INTEGER
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_FLOAT
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_param_type::STARPU_TOP_PARAM_ENUM
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \enum starpu_top_message_type
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 StarPU-Top Message type
 \var starpu_top_message_type::TOP_TYPE_GO
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_SET
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_CONTINUE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_ENABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DISABLE
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_DEBUG
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 \var starpu_top_message_type::TOP_TYPE_UNKNOW
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 todo
 
 \struct starpu_top_data
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_data::id
 todo
 \var starpu_top_data::name
@@ -86,7 +86,7 @@ todo
 
 \struct starpu_top_param
 todo
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 \var starpu_top_param::id
 todo
 \var starpu_top_param::name
@@ -113,98 +113,98 @@ todo
 todo
 
 @name Functions to call before the initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
-\fn struct starpu_top_data *starpu_top_add_data_boolean(const char* data_name, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type boolean.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data * starpu_top_add_data_integer(const char* data_name, int minimum_value, int maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type integer. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_data* starpu_top_add_data_float(const char* data_name, double minimum_value, double maximum_value, int active)
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active)
+\ingroup API_StarPUTop_Interface
 This function registers a data named \p data_name of type float. The
 minimum and maximum values will be useful to define the scale in the UI.
 If \p active=0, the value will NOT be displayed to user by default.
 Any other value will make the value displayed by default.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_boolean(const char* param_name, int* parameter_field, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 boolean. The \p callback function will be called when the parameter is
 modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_float(const char* param_name, double* parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 float. The minimum and maximum values will be used to prevent the user
 from setting an incorrect value. The \p callback function will be
 called when the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_integer(const char* param_name, int* parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type
 integer. The minimum and maximum values will be used to prevent the
 user from setting an incorrect value. The \p callback function will be
 called when the parameter is modified by the UI, and can be null.
 
-\fn struct starpu_top_param* starpu_top_register_parameter_enum(const char* param_name, int* parameter_field, char** values, int nb_values, void (*callback)(struct starpu_top_param*))
-\ingroup API_StarPU-Top_Interface
+\fn struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*))
+\ingroup API_StarPUTop_Interface
 This function registers a parameter named \p param_name, of type enum.
 The list of allowed \p values will be used to prevent the user from
 setting an incorrect value. The \p callback function will be called
 when the parameter is modified by the UI, and can be null.
 
 @name Initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_init_and_wait(const char *server_name)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function must be called when all parameters and data have been
 registered AND initialised (for parameters). This function will wait
 for a TOP to connect, send initialisation sentences, and wait for the
 GO message.
 
 @name To call after initialisation
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 
 \fn void starpu_top_update_parameter(const struct starpu_top_param *param)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function should be called after every modification of a parameter
 from something other than starpu_top. This function notifies the UI
 that the configuration has changed.
 
 \fn void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_integer(const struct starpu_top_data *data, int value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_update_data_float(const struct starpu_top_data *data, double value)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function updates the value of the starpu_top_data on UI.
 
 \fn void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function notifies the UI that the task has been planned to run from \p start to \p end, on the given computation core.
 
 \fn void starpu_top_debug_log(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer does not
 need to check if the debug mode is active: this is checked by
 starpu_top itself. It just sends a message to be displayed by the UI.
 
 \fn void starpu_top_debug_lock(const char *message)
-\ingroup API_StarPU-Top_Interface
+\ingroup API_StarPUTop_Interface
 This function is useful in debug mode. The StarPU developer does not
 need to check if the debug mode is active: this is checked by
 starpu_top itself. It sends a message and waits for a continue message

+ 1 - 1
doc/doxygen/chapters/api/workers.doxy

@@ -99,7 +99,7 @@ The returned value should be at most \ref STARPU_MAXSCCDEVS.
 This function returns the number of OpenCL devices controlled by
 StarPU. The returned value should be at most \ref STARPU_MAXOPENCLDEVS.
 
-\fn int starpu_worker_get_id (void)
+\fn int starpu_worker_get_id(void)
 \ingroup API_Workers_Properties
 This function returns the identifier of the current worker, i.e
 the one associated to the calling thread. The returned value is either

+ 30 - 133
doc/doxygen/chapters/basic_examples.doxy

@@ -12,7 +12,7 @@
 
 This section shows how to implement a simple program that submits a task
 to StarPU using the StarPU C extension (\ref cExtensions). The complete example, and additional examples,
-is available in the <c>gcc-plugin/examples</c> directory of the StarPU
+is available in the directory <c>gcc-plugin/examples</c> of the StarPU
 distribution. A similar example showing how to directly use the StarPU's API is shown
 in \ref HelloWorldUsingStarPUAPI.
 
@@ -24,7 +24,7 @@ has a single implementation for CPU:
 
 \snippet hello_pragma.c To be included
 
-The code can then be compiled and linked with GCC and the <c>-fplugin</c> flag:
+The code can then be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` hello-starpu.c \
@@ -92,9 +92,9 @@ compiler implicitly do it as examplified above.
 The field starpu_codelet::nbuffers specifies the number of data buffers that are
 manipulated by the codelet: here the codelet does not access or modify any data
 that is controlled by our data management library. Note that the argument
-passed to the codelet (the field starpu_task::cl_arg) does not count
-as a buffer since it is not managed by our data management library,
-but just contain trivial parameters.
+passed to the codelet (the parameter <c>cl_arg</c> of the function
+<c>cpu_func</c>) does not count as a buffer since it is not managed by
+our data management library, but just contain trivial parameters.
 
 \internal
 TODO need a crossref to the proper description of "where" see bla for more ...
@@ -168,7 +168,7 @@ int main(int argc, char **argv)
 \endcode
 
 Before submitting any tasks to StarPU, starpu_init() must be called. The
-<c>NULL</c> argument specifies that we use default configuration. Tasks cannot
+<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
 be submitted after the termination of StarPU by a call to
 starpu_shutdown().
 
@@ -194,12 +194,13 @@ computational kernel that multiplies its input vector by a constant,
 the constant could be specified by the means of this buffer, instead
 of registering it as a StarPU data. It must however be noted that
 StarPU avoids making copy whenever possible and rather passes the
-pointer as such, so the buffer which is pointed at must kept allocated
+pointer as such, so the buffer which is pointed at must be kept allocated
 until the task terminates, and if several tasks are submitted with
 various parameters, each of them must be given a pointer to their
-buffer.	
+own buffer.
 
-Once a task has been executed, an optional callback function is be called.
+Once a task has been executed, an optional callback function
+starpu_task::callback_func is called when defined.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The pointer
 starpu_task::callback_arg is passed as an argument of the callback
@@ -211,7 +212,7 @@ void (*callback_function)(void *);
 
 If the field starpu_task::synchronous is non-zero, task submission
 will be synchronous: the function starpu_task_submit() will not return
-until the task was executed. Note that the function starpu_shutdown()
+until the task has been executed. Note that the function starpu_shutdown()
 does not guarantee that asynchronous tasks have been executed before
 it returns, starpu_task_wait_for_all() can be used to that effect, or
 data can be unregistered (starpu_data_unregister()), which will
@@ -237,12 +238,12 @@ we show how StarPU tasks can manipulate data.
 
 We will first show how to use the C language extensions provided by
 the GCC plug-in (\ref cExtensions). The complete example, and
-additional examples, is available in the <c>gcc-plugin/examples</c>
-directory of the StarPU distribution. These extensions map directly
+additional examples, is available in the directory <c>gcc-plugin/examples</c>
+of the StarPU distribution. These extensions map directly
 to StarPU's main concepts: tasks, task implementations for CPU,
 OpenCL, or CUDA, and registered data buffers. The standard C version
-that uses StarPU's standard C programming interface is given in the
-next section (\ref VectorScalingUsingStarPUAPI).
+that uses StarPU's standard C programming interface is given in \ref
+VectorScalingUsingStarPUAPI.
 
 First of all, the vector-scaling task and its simple CPU implementation
 have to be defined:
@@ -268,7 +269,7 @@ implemented:
 
 \snippet hello_pragma2.c To be included
 
-The <c>main</c> function above does several things:
+The function <c>main</c> above does several things:
 
 <ul>
 <li>
@@ -287,22 +288,20 @@ StarPU to transfer that memory region between GPUs and the main memory.
 Removing this <c>pragma</c> is an error.
 </li>
 <li>
-It invokes the <c>vector_scal</c> task.  The invocation looks the same
+It invokes the task <c>vector_scal</c>.  The invocation looks the same
 as a standard C function call.  However, it is an asynchronous
 invocation, meaning that the actual call is performed in parallel with
 the caller's continuation.
 </li>
 <li>
-It waits for the termination of the <c>vector_scal</c>
-asynchronous call.
+It waits for the termination of the asynchronous call <c>vector_scal</c>.
 </li>
 <li>
 Finally, StarPU is shut down.
 </li>
 </ul>
 
-The program can be compiled and linked with GCC and the <c>-fplugin</c>
-flag:
+The program can be compiled and linked with GCC and the flag <c>-fplugin</c>:
 
 \verbatim
 $ gcc `pkg-config starpu-1.2 --cflags` vector_scal.c \
@@ -317,7 +316,7 @@ And voilà!
 Now, this is all fine and great, but you certainly want to take
 advantage of these newfangled GPUs that your lab just bought, don't you?
 
-So, let's add an OpenCL implementation of the <c>vector_scal</c> task.
+So, let's add an OpenCL implementation of the task <c>vector_scal</c>.
 We assume that the OpenCL kernel is available in a file,
 <c>vector_scal_opencl_kernel.cl</c>, not shown here.  The OpenCL task
 implementation is similar to that used with the standard C API
@@ -374,14 +373,14 @@ vector_scal_opencl (unsigned size, float vector[size], float factor)
 \endcode
 
 The OpenCL kernel itself must be loaded from <c>main</c>, sometime after
-the <c>initialize</c> pragma:
+the pragma <c>initialize</c>:
 
 \code{.c}
 starpu_opencl_load_opencl_from_file ("vector_scal_opencl_kernel.cl",
                                        &cl_programs, "");
 \endcode
 
-And that's it.  The <c>vector_scal</c> task now has an additional
+And that's it.  The task <c>vector_scal</c> now has an additional
 implementation, for OpenCL, which StarPU's scheduler may choose to use
 at run-time.  Unfortunately, the <c>vector_scal_opencl</c> above still
 has to go through the common OpenCL boilerplate; in the future,
@@ -404,40 +403,13 @@ The actual implementation of the CUDA task goes into a separate
 compilation unit, in a <c>.cu</c> file.  It is very close to the
 implementation when using StarPU's standard C API (\ref DefinitionOfTheCUDAKernel).
 
-\code{.c}
-/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
-
-#include <starpu.h>
-#include <stdlib.h>
-
-static __global__ void
-vector_mult_cuda (unsigned n, float *val, float factor)
-{
-  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (i < n)
-    val[i] *= factor;
-}
-
-/* Definition of the task implementation declared in the C file. */
-extern "C" void
-vector_scal_cuda (size_t size, float vector[], float factor)
-{
-  unsigned threads_per_block = 64;
-  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
-
-  vector_mult_cuda <<< nblocks, threads_per_block, 0,
-    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+\snippet scal_pragma.cu To be included
 
-  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
-}
-\endcode
-
-The complete source code, in the <c>gcc-plugin/examples/vector_scal</c>
-directory of the StarPU distribution, also shows how an SSE-specialized
+The complete source code, in the directory <c>gcc-plugin/examples/vector_scal</c>
+of the StarPU distribution, also shows how an SSE-specialized
 CPU task implementation can be added.
 
-For more details on the C extensions provided by StarPU's GCC plug-in,
+For more details on the C extensions provided by StarPU's GCC plug-in, see
 \ref cExtensions.
 
 \section VectorScalingUsingStarPUAPI Vector Scaling Using StarPU's API
@@ -479,7 +451,7 @@ starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
 The first argument, called the <b>data handle</b>, is an opaque pointer which
 designates the array in StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data originally resides. Here it is 0 since the <c>vector array</c> is in
+where the data originally resides. Here it is 0 since the array <c>vector</c> is in
 the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
 The following shows how to construct a StarPU task that will manipulate the
@@ -569,36 +541,9 @@ The CUDA implementation can be written as follows. It needs to be compiled with
 a CUDA compiler such as nvcc, the NVIDIA CUDA compiler driver. It must be noted
 that the vector pointer returned by ::STARPU_VECTOR_GET_PTR is here a
 pointer in GPU memory, so that it can be passed as such to the
-<c>vector_mult_cuda</c> kernel call.
-
-\code{.c}
-#include <starpu.h>
+kernel call <c>vector_mult_cuda</c>.
 
-static __global__ void vector_mult_cuda(unsigned n, float *val,
-                                        float factor)
-{
-    unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
-    if (i < n)
-        val[i] *= factor;
-}
-
-extern "C" void scal_cuda_func(void *buffers[], void *_args)
-{
-    float *factor = (float *)_args;
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* CUDA copy of the vector pointer */
-    float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned threads_per_block = 64;
-    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
-
-    vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>
-                    (n, val, *factor);
-
-    cudaStreamSynchronize(starpu_cuda_get_local_stream());
-}
-\endcode
+\snippet vector_scal_cuda.cu To be included
 
 \subsection DefinitionOfTheOpenCLKernel Definition of the OpenCL Kernel
 
@@ -620,55 +565,7 @@ which returns a <c>cl_mem</c> (which is not a device pointer, but an OpenCL
 handle), which can be passed as such to the OpenCL kernel. The difference is
 important when using partitioning, see \ref PartitioningData.
 
-\code{.c}
-#include <starpu.h>
-
-extern struct starpu_opencl_program programs;
-
-void scal_opencl_func(void *buffers[], void *_args)
-{
-    float *factor = _args;
-    int id, devid, err;     /* OpenCL specific code */
-    cl_kernel kernel;       /* OpenCL specific code */
-    cl_command_queue queue; /* OpenCL specific code */
-    cl_event event;         /* OpenCL specific code */
-
-    /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
-    /* OpenCL copy of the vector pointer */
-    cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-
-    { /* OpenCL specific code */
-        id = starpu_worker_get_id();
-        devid = starpu_worker_get_devid(id);
-
-	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
-	                       "vector_mult_opencl", devid);   /* Name of the codelet defined above */
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-
-        err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-        err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-        err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-        if (err) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        size_t global=n;
-        size_t local=1;
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
-        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-    }
-
-    {  /* OpenCL specific code */
-        clFinish(queue);
-        starpu_opencl_collect_stats(event);
-        clReleaseEvent(event);
-
-        starpu_opencl_release_kernel(kernel);
-    }
-}
-\endcode
-
+\snippet vector_scal_opencl.c To be included
 
 \subsection DefinitionOfTheMainCode Definition of the Main Code
 

+ 8 - 8
doc/doxygen/chapters/building.doxy

@@ -134,8 +134,8 @@ $ make install
 \endverbatim
 
 Libtool interface versioning information are included in
-libraries names (libstarpu-1.2.so, libstarpumpi-1.2.so and
-libstarpufft-1.2.so).
+libraries names (<c>libstarpu-1.2.so</c>, <c>libstarpumpi-1.2.so</c> and
+<c>libstarpufft-1.2.so</c>).
 
 \section SettingUpYourOwnCode Setting up Your Own Code
 
@@ -145,10 +145,10 @@ StarPU provides a pkg-config executable to obtain relevant compiler
 and linker flags.
 Compiling and linking an application against StarPU may require to use
 specific flags or libraries (for instance <c>CUDA</c> or <c>libspe2</c>).
-To this end, it is possible to use the <c>pkg-config</c> tool.
+To this end, it is possible to use the tool <c>pkg-config</c>.
 
 If StarPU was not installed at some standard location, the path of StarPU's
-library must be specified in the <c>PKG_CONFIG_PATH</c> environment variable so
+library must be specified in the environment variable <c>PKG_CONFIG_PATH</c> so
 that <c>pkg-config</c> can find it. For example if StarPU was installed in
 <c>$prefix_dir</c>:
 
@@ -175,10 +175,10 @@ Make sure that <c>pkg-config --libs starpu-1.2</c> actually produces some output
 before going further: <c>PKG_CONFIG_PATH</c> has to point to the place where
 <c>starpu-1.2.pc</c> was installed during <c>make install</c>.
 
-Also pass the <c>--static</c> option if the application is to be
+Also pass the option <c>--static</c> if the application is to be
 linked statically.
 
-It is also necessary to set the variable <c>LD_LIBRARY_PATH</c> to
+It is also necessary to set the environment variable <c>LD_LIBRARY_PATH</c> to
 locate dynamic libraries at runtime.
 
 \verbatim
@@ -283,10 +283,10 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 \subsection CholeskyFactorization Cholesky Factorization
 
-<c>cholesky\*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
+<c>cholesky/*</c> perform a Cholesky factorization (single precision). They use different dependency primitives.
 
 \subsection LUFactorization LU Factorization
 
-<c>lu\*</c> perform an LU factorization. They use different dependency primitives.
+<c>lu/*</c> perform an LU factorization. They use different dependency primitives.
 
 */

+ 45 - 0
doc/doxygen/chapters/code/scal_pragma.cu

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+//! [To be included]
+/* CUDA implementation of the `vector_scal' task, to be compiled with `nvcc'. */
+
+#include <starpu.h>
+#include <stdlib.h>
+
+static __global__ void
+vector_mult_cuda (unsigned n, float *val, float factor)
+{
+  unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (i < n)
+    val[i] *= factor;
+}
+
+/* Definition of the task implementation declared in the C file. */
+extern "C" void
+vector_scal_cuda (size_t size, float vector[], float factor)
+{
+  unsigned threads_per_block = 64;
+  unsigned nblocks = (size + threads_per_block - 1) / threads_per_block;
+
+  vector_mult_cuda <<< nblocks, threads_per_block, 0,
+    starpu_cuda_get_local_stream () >>> (size, vector, factor);
+
+  cudaStreamSynchronize (starpu_cuda_get_local_stream ());
+}
+//! [To be included]

+ 24 - 18
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -24,29 +24,33 @@ extern struct starpu_opencl_program programs;
 void scal_opencl_func(void *buffers[], void *_args)
 {
     float *factor = _args;
-    int id, devid, err;
-    cl_kernel kernel;
-    cl_command_queue queue;
-    cl_event event;
+    int id, devid, err;                   /* OpenCL specific code */
+    cl_kernel kernel;                     /* OpenCL specific code */
+    cl_command_queue queue;               /* OpenCL specific code */
+    cl_event event;                       /* OpenCL specific code */
 
     /* length of the vector */
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
     cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
-    id = starpu_worker_get_id();
-    devid = starpu_worker_get_devid(id);
+    {  /* OpenCL specific code */
+	 id = starpu_worker_get_id();
+	 devid = starpu_worker_get_devid(id);
 
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
-                                    devid);
-    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = starpu_opencl_load_kernel(&kernel, &queue,
+					 &programs,
+					 "vector_mult_opencl", /* Name of the codelet */
+					 devid);
+	 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-    err = clSetKernelArg(kernel, 0, sizeof(n), &n);
-    err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
-    err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
-    if (err) STARPU_OPENCL_REPORT_ERROR(err);
+	 err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+	 err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
+	 err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
+	 if (err) STARPU_OPENCL_REPORT_ERROR(err);
+    }
 
-    {
+    {   /* OpenCL specific code */
         size_t global=n;
         size_t local;
         size_t s;
@@ -63,10 +67,12 @@ void scal_opencl_func(void *buffers[], void *_args)
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 
-    clFinish(queue);
-    starpu_opencl_collect_stats(event);
-    clReleaseEvent(event);
+    {  /* OpenCL specific code */
+	 clFinish(queue);
+	 starpu_opencl_collect_stats(event);
+	 clReleaseEvent(event);
 
-    starpu_opencl_release_kernel(kernel);
+	 starpu_opencl_release_kernel(kernel);
+    }
 }
 //! [To be included]

+ 16 - 0
doc/doxygen/chapters/configure_options.doxy

@@ -329,6 +329,22 @@ Use the compiler <c>mpicc</c> at <c>path</c>, for StarPU-MPI.
 Enable the activity polling method for StarPU-MPI.
 </dd>
 
+<dt>--with-coi-dir</dt>
+<dd>
+\anchor with-coi-dir
+\addindex __configure__--with-coi-dir
+Specify the directory to the COI library for MIC support.
+The default value is <c>/opt/intel/mic/coi</c>
+</dd>
+
+<dt>--mic-host</dt>
+<dd>
+\anchor mic-host
+\addindex __configure__--mic-host
+Specify the precise MIC architecture host identifier.
+The default value is <c>x86_64-k1om-linux</c>
+</dd>
+
 \section AdvancedConfiguration Advanced Configuration
 
 <dl>

+ 0 - 25
doc/doxygen/chapters/environment_variables.doxy

@@ -345,31 +345,6 @@ to 0. It is enabled by default or for any other values of the variable
 \ref STARPU_MPI_CACHE.
 </dd>
 
-<dt>STARPU_MIC_HOST</dt>
-<dd>
-\anchor STARPU_MIC_HOST
-\addindex __env__STARPU_MIC_HOST
-Defines the value of the parameter <c>--host</c> passed to
-<c>configure</c> for the cross-compilation. The current default is
-<c>x86_64-k1om-linux</c>.
-</dd>
-
-<dt>STARPU_MIC_CC_PATH</dt>
-<dd>
-\anchor STARPU_MIC_CC_PATH
-\addindex __env__STARPU_MIC_CC_PATH
-Defines the path to the MIC cross-compiler. The current default is
-<c>/usr/linux-k1om-4.7/bin/</c>
-</dd>
-
-<dt>STARPU_COI_DIR</dt>
-<dd>
-\anchor STARPU_COI_DIR
-\addindex __env__STARPU_COI_DIR
-Defines the path to the COI library. The current default is
-<c>/opt/intel/mic/coi</c>.
-</dd>
-
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug

+ 3 - 3
doc/doxygen/chapters/fft_support.doxy

@@ -9,7 +9,7 @@
 /*! \page FFTSupport FFT Support
 
 StarPU provides <c>libstarpufft</c>, a library whose design is very similar to
-both fftw and cufft, the difference being that it takes benefit from both CPUs
+both <c>fftw</c> and <c>cufft</c>, the difference being that it takes benefit from both CPUs
 and GPUs. It should however be noted that GPUs do not have the same precision as
 CPUs, so the results may different by a negligible amount.
 
@@ -33,7 +33,7 @@ The documentation below is given with names for double precision, replace
 
 Only complex numbers are supported at the moment.
 
-The application has to call starpu_init() before calling starpufft functions.
+The application has to call starpu_init() before calling <c>starpufft</c> functions.
 
 Either main memory pointers or data handles can be provided.
 
@@ -66,6 +66,6 @@ $ pkg-config --cflags starpufft-1.2  # options for the compiler
 $ pkg-config --libs starpufft-1.2    # options for the linker
 \endverbatim
 
-Also pass the <c>--static</c> option if the application is to be linked statically.
+Also pass the option <c>--static</c> if the application is to be linked statically.
 
 */

+ 10 - 10
doc/doxygen/chapters/mic_scc_support.doxy

@@ -13,19 +13,13 @@
 SCC support just needs the presence of the RCCE library.
 
 MIC support actually needs two compilations of StarPU, one for the host and one for
-the device. The script <c>mic-configure</c> can be used to achieve this: it basically
+the device. The PATH environment variable has to include the path to the
+cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c>.
+The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
 calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c> and
 <c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
 recurse into both directories.
 
-\internal
-TODO: move to configuration section?
-\endinternal
-
-It can be parameterized with the environment variables \ref
-STARPU_MIC_HOST, \ref STARPU_MIC_CC_PATH and \ref STARPU_COI_DIR.
-
-
 \section PortingApplicationsToMICSCC Porting Applications To MIC/SCC
 
 The simplest way to port an application to MIC/SCC is to set the field
@@ -49,8 +43,14 @@ the MIC-cross-built binary. It will look for the file given by the
 environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
 directory given by the environment variable \ref
 STARPU_MIC_SINK_PROGRAM_PATH, or in the field
-starpu_config::mic_sink_program_path. It will also look in the current
+starpu_conf::mic_sink_program_path. It will also look in the current
 directory for the same binary name plus the suffix <c>-mic</c> or
 <c>_mic</c>.
 
+The testsuite can be started by simply running <c>make check</c> from the
+top directory. It will recurse into both <c>build_host</c> to run tests with only
+the host, and into <c>build_mic</c> to run tests with both the host and the MIC
+devices. Single tests with the host and the MIC can be run by starting
+<c>./loader-cross.sh ./the_test</c> from <c>build_mic/tests</c>.
+
 */

+ 2 - 2
doc/doxygen/chapters/mpi_support.doxy

@@ -31,7 +31,7 @@ $ pkg-config --cflags starpumpi-1.2  # options for the compiler
 $ pkg-config --libs starpumpi-1.2    # options for the linker
 \endverbatim
 
-You also need pass the <c>--static</c> option if the application is to
+You also need to pass the option <c>--static</c> if the application is to
 be linked statically.
 
 \code{.c}
@@ -257,7 +257,7 @@ int my_distrib(int x, int y, int nb_nodes) {
 
 Now the data can be registered within StarPU. Data which are not
 owned but will be needed for computations can be registered through
-the lazy allocation mechanism, i.e. with a <c>home_node</c> set to -1.
+the lazy allocation mechanism, i.e. with a <c>home_node</c> set to <c>-1</c>.
 StarPU will automatically allocate the memory when it is used for the
 first time.
 

+ 12 - 12
doc/doxygen/chapters/optimize_performance.doxy

@@ -37,7 +37,7 @@ starpu_data_set_wt_mask(img_handle, 1<<0);
 \endcode
 
 will for instance request to always automatically transfer a replicate into the
-main memory (node 0), as bit 0 of the write-through bitmask is being set.
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
 
 \code{.c}
 starpu_data_set_wt_mask(img_handle, ~0U);
@@ -108,7 +108,7 @@ possibility according to task size, one can run
 speedup of independent tasks of very small sizes.
 
 The choice of scheduler also has impact over the overhead: for instance, the
-<c>dmda</c> scheduler takes time to make a decision, while <c>eager</c> does
+ scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
 not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
 impact that has on the target machine.
 
@@ -132,7 +132,7 @@ priority information to StarPU.
 
 \section TaskSchedulingPolicy Task Scheduling Policy
 
-By default, StarPU uses the <c>eager</c> simple greedy scheduler. This is
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
 because it provides correct load balance even if the application codelets do not
 have performance models. If your application codelets have performance models
 (\ref PerformanceModelExample), you should change the scheduler thanks
@@ -276,14 +276,14 @@ and in Joules for the energy consumption models.
 
 Distributing tasks to balance the load induces data transfer penalty. StarPU
 thus needs to find a balance between both. The target function that the
-<c>dmda</c> scheduler of StarPU
+scheduler <c>dmda</c> of StarPU
 tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
 <c>T_execution</c> is the estimated execution time of the codelet (usually
 accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
 latter is estimated based on bus calibration before execution start,
 i.e. with an idle machine, thus without contention. You can force bus
 re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to 1, but it can be worth trying to tweak it
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
 by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
 real application execution, contention makes transfer times bigger.
 This is of course imprecise, but in practice, a rough estimation
@@ -291,7 +291,7 @@ already gives the good results that a precise estimation would give.
 
 \section DataPrefetch Data Prefetch
 
-The <c>heft</c>, <c>dmda</c> and <c>pheft</c> scheduling policies
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
 perform data prefetch (see \ref STARPU_PREFETCH):
 as soon as a scheduling decision is taken for a task, requests are issued to
 transfer its required data to the target processing unit, if needed, so that
@@ -310,9 +310,9 @@ the handle and the desired target memory node.
 \section Power-basedScheduling Power-based Scheduling
 
 If the application can provide some power performance model (through
-the <c>power_model</c> field of the codelet structure), StarPU will
+the field starpu_codelet::power_model), StarPU will
 take it into account when distributing tasks. The target function that
-the <c>dmda</c> scheduler minimizes becomes <c>alpha * T_execution +
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
 beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
 is the estimated task consumption in Joules. To tune this parameter, use
 <c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
@@ -333,7 +333,7 @@ On-line task consumption measurement is currently only supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
 simulator. Applications can however provide explicit measurements by
 using the function starpu_perfmodel_update_history() (examplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model. Fine-grain
+with the <c>power_model</c> performance model). Fine-grain
 measurement is often not feasible with the feedback provided by the hardware, so
 the user can for instance run a given task a thousand times, measure the global
 consumption for that series of tasks, divide it by a thousand, repeat for
@@ -446,9 +446,9 @@ $ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
 TEST PASSED
 \endverbatim
 
-Note that we force to use the dmda scheduler to generate performance
-models for the application. The application may need to be run several
-times before the model is calibrated.
+Note that we force to use the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
 
 \subsection Simulation Simulation
 

+ 15 - 15
doc/doxygen/chapters/performance_feedback.doxy

@@ -16,7 +16,7 @@ nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
 install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
 <c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
 sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to libayudame.so.
+to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
 
 Make sure to specify at least the same number of CPUs in the dialog box as your
 machine has, otherwise an error will happen during execution. Future versions
@@ -35,7 +35,7 @@ call starpu_profiling_status_set() with the parameter
 is already enabled or not by calling starpu_profiling_status_get().
 Enabling monitoring also reinitialize all previously collected
 feedback. The environment variable \ref STARPU_PROFILING can also be
-set to 1 to achieve the same effect.
+set to <c>1</c> to achieve the same effect.
 
 Likewise, performance monitoring is stopped by calling
 starpu_profiling_status_set() with the parameter
@@ -247,7 +247,7 @@ Or you can simply point the <c>PKG_CONFIG_PATH</c> to
 \ref with-fxt "--with-fxt" to <c>./configure</c>
 
 When FxT is enabled, a trace is generated when StarPU is terminated by calling
-starpu_shutdown()). The trace is a binary file whose name has the form
+starpu_shutdown(). The trace is a binary file whose name has the form
 <c>prof_file_XXX_YYY</c> where <c>XXX</c> is the user name, and
 <c>YYY</c> is the pid of the process that used StarPU. This file is saved in the
 <c>/tmp/</c> directory by default, or by the directory specified by
@@ -269,7 +269,7 @@ application shutdown.
 This will create a file <c>paje.trace</c> in the current directory that
 can be inspected with the <a href="http://vite.gforge.inria.fr/">ViTE trace
 visualizing open-source tool</a>.  It is possible to open the
-<c>paje.trace</c> file with ViTE by using the following command:
+file <c>paje.trace</c> with ViTE by using the following command:
 
 \verbatim
 $ vite paje.trace
@@ -322,7 +322,7 @@ generate an activity trace by calling:
 $ starpu_fxt_tool -i filename
 \endverbatim
 
-This will create an <c>activity.data</c> file in the current
+This will create a file <c>activity.data</c> in the current
 directory. A profile of the application showing the activity of StarPU
 during the execution of the program can be generated:
 
@@ -341,7 +341,7 @@ efficiently. The black sections indicate that the processing unit was blocked
 because there was no task to process: this may indicate a lack of parallelism
 which may be alleviated by creating more tasks when it is possible.
 
-The second part of the <c>activity.eps</c> picture is a graph showing the
+The second part of the picture <c>activity.eps</c> is a graph showing the
 evolution of the number of tasks available in the system during the execution.
 Ready tasks are shown in black, and tasks that are submitted but not
 schedulable yet are shown in grey.
@@ -360,8 +360,8 @@ file: <starpu_slu_lu_model_22.hannibal>
 file: <starpu_slu_lu_model_12.hannibal>
 \endverbatim
 
-Here, the codelets of the lu example are available. We can examine the
-performance of the 22 kernel (in micro-seconds), which is history-based:
+Here, the codelets of the example <c>lu</c> are available. We can examine the
+performance of the kernel <c>22</c> (in micro-seconds), which is history-based:
 
 \verbatim
 $ starpu_perfmodel_display -s starpu_slu_lu_model_22
@@ -414,7 +414,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
-run in the <c>gnuplot</c> tool, which shows the corresponding curve.
+run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
 When the field starpu_task::flops is set, <c>starpu_perfmodel_plot</c> can
 directly draw a GFlops curve, by simply adding the <c>-f</c> option:
@@ -448,13 +448,13 @@ $ starpu_perfmodel_plot -s non_linear_memset_regression_based -i /tmp/prof_file_
 It will produce a <c>.gp</c> file which contains both the performance model
 curves, and the profiling measurements.
 
-If you have the <c>R</c> statistical tool installed, you can additionally use
+If you have the statistical tool <c>R</c> installed, you can additionally use
 
 \verbatim
 $ starpu_codelet_histo_profile distrib.data
 \endverbatim
 
-Which will create one pdf file per codelet and per input size, showing a
+Which will create one <c>.pdf</c> file per codelet and per input size, showing a
 histogram of the codelet execution time distribution.
 
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
@@ -475,13 +475,13 @@ use this.
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
-the option \ref enable-memory-stats "--enable-memory-stats" when running configure. It is then
-possible to call the function starpu_display_memory_stats() to
+the option \ref enable-memory-stats "--enable-memory-stats" when running <c>configure</c>. It is then
+possible to call the function starpu_data_display_memory_stats() to
 display statistics about the current data handles registered within StarPU.
 
 Moreover, statistics will be displayed at the end of the execution on
 data handles which have not been cleared out. This can be disabled by
-setting the environment variable \ref STARPU_MEMORY_STATS to 0.
+setting the environment variable \ref STARPU_MEMORY_STATS to <c>0</c>.
 
 For example, if you do not unregister data at the end of the complex
 example, you will get something similar to:
@@ -552,7 +552,7 @@ of the application. To enable them, you need to pass the option
 starpu_shutdown() various statistics will be displayed,
 execution, MSI cache statistics, allocation cache statistics, and data
 transfer statistics. The display can be disabled by setting the
-environment variable \ref STARPU_STATS to 0.
+environment variable \ref STARPU_STATS to <c>0</c>.
 
 \verbatim
 $ ./examples/cholesky/cholesky_tag

+ 12 - 0
doc/doxygen/dev/checkDoc.sh

@@ -0,0 +1,12 @@
+#!/bin/bash
+
+x=$(grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF != 2')
+if test -n "$x" ; then
+    echo Errors on group definitions
+    echo $x
+fi
+
+echo
+echo "Defined groups"
+grep ingroup chapters/api/*|awk -F':' '{print $2}'| awk 'NF == 2'|sort|uniq
+echo

+ 38 - 0
doc/doxygen/dev/starpu_check_documented.py

@@ -0,0 +1,38 @@
+#!/usr/bin/python
+
+import os
+
+class bcolors:
+    FAILURE = '\033[91m'
+    NORMAL = '\033[0m'
+
+def loadFunctionsAndDatatypes(flist, dtlist, fname):
+    f = open(fname, 'r')
+    for line in f:
+        mline = line[:-1]
+        if mline.count("\\fn"):
+            if mline.count("fft") == 0:
+                func = mline.replace("\\fn ", "")
+                flist.append(list([func, fname]))
+        if mline.count("\\struct ") or mline.count("\\def ") or mline.count("\\typedef ") or mline.count("\\enum "):
+            datatype = mline.replace("\\struct ", "").replace("\\def ", "").replace("\\typedef ", "").replace("\\enum ","")
+            dtlist.append(list([datatype, fname]))
+    f.close()
+
+functions = []
+datatypes = []
+
+for docfile in os.listdir('chapters/api'):
+    if docfile.count(".doxy"):
+        loadFunctionsAndDatatypes(functions, datatypes, "chapters/api/"+docfile)
+
+for function in functions:
+    x = os.system("fgrep -l \"" + function[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Function <" + bcolors.FAILURE + function[0] + bcolors.NORMAL + "> documented in <" + function[1] + "> does not exist in StarPU's API"
+
+for datatype in datatypes:
+    x = os.system("fgrep -l \"" + datatype[0] + "\" ../../include/*.h ../../mpi/include/*.h ../../starpufft/*h ../../sc_hypervisor/include/*.h > /dev/null")
+    if x != 0:
+        print "Datatype <" + bcolors.FAILURE + datatype[0] + bcolors.NORMAL + "> documented in <" + datatype[1] + "> does not exist in StarPU's API"
+

+ 78 - 0
doc/doxygen/dev/starpu_check_undocumented.sh

@@ -0,0 +1,78 @@
+#!/bin/bash
+# Note: expects Coccinelle's spatch command in the PATH
+# See: http://coccinelle.lip6.fr/
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2011, 2012, 2013 Centre National de la Recherche Scientifique
+# Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+stcolor=$(tput sgr0)
+redcolor=$(tput setaf 1)
+greencolor=$(tput setaf 2)
+
+H_FILES=$(find include mpi/include -name '*.h')
+
+functions=$(spatch -very_quiet -sp_file tools/dev/starpu_funcs.cocci $H_FILES)
+for func in $functions ; do
+	fname=$(echo $func|awk -F ',' '{print $1}')
+	location=$(echo $func|awk -F ',' '{print $2}')
+	x=$(grep "$fname(" doc/doxygen/chapters/api/*.doxy | grep "\\fn")
+	if test "$x" == "" ; then
+		echo "function ${redcolor}${fname}${stcolor} at location ${redcolor}$location${stcolor} is not (or incorrectly) documented"
+#	else
+#		echo "function ${greencolor}${fname}${stcolor} at location $location is correctly documented"
+	fi
+done
+
+echo
+
+structs=$(grep "struct starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for struct in $structs ; do
+    x=$(grep -F "\\struct $struct" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "struct ${redcolor}${struct}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+enums=$(grep "enum starpu" $H_FILES | grep -v "[;|,|(|)]" | awk '{print $2}')
+for enum in $enums ; do
+    x=$(grep -F "\\enum $enum" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "enum ${redcolor}${enum}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+macros=$(grep "define\b" $H_FILES |grep -v deprecated|grep "#" | grep -v "__" | sed 's/#[ ]*/#/g' | awk '{print $2}' | awk -F'(' '{print $1}' | sort|uniq)
+for macro in $macros ; do
+    x=$(grep -F "\\def $macro" doc/doxygen/chapters/api/*.doxy)
+    if test "$x" == "" ; then
+	echo "macro ${redcolor}${macro}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+
+echo
+
+variables=$(grep --exclude-dir=.svn -rs -E "(getenv|get_env)" src/| tr ' ' '\012'|grep -E "(getenv|get_env)" | grep "\"" | sed 's/.*("//' | sed 's/").*//'|sort|uniq)
+for variable in $variables ; do
+    x=$(grep "$variable" doc/doxygen/chapters/environment_variables.doxy | grep "\\anchor")
+    if test "$x" == "" ; then
+	echo "variable ${redcolor}${variable}${stcolor} is not (or incorrectly) documented"
+    fi
+done
+

tools/dev/starpu_funcs.cocci → doc/doxygen/dev/starpu_funcs.cocci


+ 1 - 1
doc/doxygen/refman.tex

@@ -114,7 +114,7 @@ Documentation License”.
 \hypertarget{AdvancedExamples}{}
 \input{AdvancedExamples}
 
-\chapter{How to optimize performance with StarPU}
+\chapter{How To Optimize Performance With StarPU}
 \label{HowToOptimizePerformanceWithStarPU}
 \hypertarget{HowToOptimizePerformanceWithStarPU}{}
 \input{HowToOptimizePerformanceWithStarPU}

tools/dev/starpu_check_documented.py → doc/texinfo/dev/starpu_check_documented.py


tools/dev/starpu_check_undocumented.sh → doc/texinfo/dev/starpu_check_undocumented.sh


+ 28 - 0
doc/texinfo/dev/starpu_funcs.cocci

@@ -0,0 +1,28 @@
+// StarPU --- Runtime system for heterogeneous multicore architectures.
+//
+// Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
+//
+// StarPU is free software; you can redistribute it and/or modify
+// it under the terms of the GNU Lesser General Public License as published by
+// the Free Software Foundation; either version 2.1 of the License, or (at
+// your option) any later version.
+//
+// StarPU is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+// See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+@starpufunc@
+position p;
+type t;
+identifier f =~ "starpu";
+@@
+
+t f@p( ... );
+
+@ script:python @
+p << starpufunc.p;
+f << starpufunc.f;
+@@
+print "%s,%s:%s" % (f,p[0].file,p[0].line)

+ 2 - 2
examples/spmv/matrix_market/mmio.c

@@ -277,8 +277,8 @@ int mm_write_mtx_array_size(FILE *f, int M, int N)
 /* use when I[], J[], and val[]J, and val[] are already allocated */
 /******************************************************************/
 
-int mm_read_mtx_crd_data(FILE *f, int M 
-				 int N, int nz, int I[], int J[],
+int mm_read_mtx_crd_data(FILE *f, int M,
+			 int N, int nz, int I[], int J[],
         double val[], MM_typecode matcode)
 {
     int i;

+ 5 - 1
examples/stencil/life_opencl.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,7 +19,11 @@
 /* #define _externC extern "C" */
 
 #include <stencil.h>
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <starpu.h>
 
 #define str(x) #x

+ 3 - 3
include/starpu_opencl.h

@@ -61,12 +61,12 @@ void starpu_opencl_get_current_queue(cl_command_queue *queue);
 
 void starpu_opencl_load_program_source(const char *source_file_name, char *located_file_name, char *located_dir_name, char *opencl_program_source);
 int starpu_opencl_compile_opencl_from_file(const char *source_file_name, const char *build_options);
-int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char* build_options);
+int starpu_opencl_compile_opencl_from_string(const char *opencl_program_source, const char *file_name, const char *build_options);
 
 int starpu_opencl_load_binary_opencl(const char *kernel_id, struct starpu_opencl_program *opencl_programs);
 
-int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char* build_options);
-int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char* build_options);
+int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options);
+int starpu_opencl_load_opencl_from_string(const char *opencl_program_source, struct starpu_opencl_program *opencl_programs, const char *build_options);
 int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs);
 
 int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid);

+ 4 - 4
include/starpu_sched_ctx.h

@@ -67,14 +67,14 @@ unsigned starpu_sched_ctx_check_if_hypervisor_exists(void);
 
 void starpu_sched_ctx_set_policy_data(unsigned sched_ctx_id, void *policy_data);
 
-void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
+void *starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
 
 
-struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
+struct starpu_worker_collection *starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
 
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 
-struct starpu_worker_collection* starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
+struct starpu_worker_collection *starpu_sched_ctx_get_worker_collection(unsigned sched_ctx_id);
 
 unsigned starpu_sched_ctx_get_nworkers(unsigned sched_ctx_id);
 
@@ -112,7 +112,7 @@ int starpu_sched_ctx_set_max_priority(unsigned sched_ctx_id, int max_prio);
 #define STARPU_DEFAULT_PRIO	0
 
 /* execute any parallel code on the workers of the sched_ctx (workers are blocked) */
-void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id);
+void *starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void *param, unsigned sched_ctx_id);
 
 #ifdef __cplusplus
 }

+ 1 - 1
include/starpu_task.h

@@ -239,7 +239,7 @@ void starpu_codelet_display_stats(struct starpu_codelet *cl);
 
 struct starpu_task *starpu_task_get_current(void);
 
-void starpu_parallel_task_barrier_init(struct starpu_task* task, int workerid);
+void starpu_parallel_task_barrier_init(struct starpu_task *task, int workerid);
 
 struct starpu_task *starpu_task_dup(struct starpu_task *task);
 

+ 1 - 1
include/starpu_task_util.h

@@ -29,7 +29,7 @@ extern "C"
 {
 #endif
 
-void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,	void (*callback)(void *), void *callback_arg);
+void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
 #define STARPU_VALUE		 (1<<19)
 #define STARPU_CALLBACK		 (1<<20)

+ 11 - 39
include/starpu_top.h

@@ -82,50 +82,22 @@ enum starpu_top_message_type
 	TOP_TYPE_UNKNOW
 };
 
-struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name,
-						    int active);
-struct starpu_top_data *starpu_top_add_data_integer(const char *data_name,
-						     int minimum_value,
-						     int maximum_value,
-						     int active);
-struct starpu_top_data *starpu_top_add_data_float(const char *data_name,
-						  double minimum_value,
-						  double maximum_value,
-						  int active);
-struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name,
-							       int *parameter_field,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name,
-							       int *parameter_field,
-							       int minimum_value,
-							       int maximum_value,
-							       void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name,
-							     double *parameter_field,
-							     double minimum_value,
-							     double maximum_value,
-							     void (*callback)(struct starpu_top_param*));
-struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name,
-							    int *parameter_field,
-							    char **values,
-							    int nb_values,
-							    void (*callback)(struct starpu_top_param*));
-
-
+struct starpu_top_data *starpu_top_add_data_boolean(const char *data_name, int active);
+struct starpu_top_data *starpu_top_add_data_integer(const char *data_name, int minimum_value, int maximum_value, int active);
+struct starpu_top_data *starpu_top_add_data_float(const char *data_name, double minimum_value, double maximum_value, int active);
 
+struct starpu_top_param *starpu_top_register_parameter_boolean(const char *param_name, int *parameter_field, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_integer(const char *param_name, int *parameter_field, int minimum_value, int maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_float(const char *param_name, double *parameter_field, double minimum_value, double maximum_value, void (*callback)(struct starpu_top_param*));
+struct starpu_top_param *starpu_top_register_parameter_enum(const char *param_name, int *parameter_field, char **values, int nb_values, void (*callback)(struct starpu_top_param*));
 
 void starpu_top_init_and_wait(const char *server_name);
 
 void starpu_top_update_parameter(const struct starpu_top_param *param);
-void starpu_top_update_data_boolean(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_integer(const struct starpu_top_data *data,
-				    int value);
-void starpu_top_update_data_float(const struct starpu_top_data *data,
-				  double value);
-void starpu_top_task_prevision(struct starpu_task *task,
-			       int devid, unsigned long long start,
-			       unsigned long long end);
+void starpu_top_update_data_boolean(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_integer(const struct starpu_top_data *data, int value);
+void starpu_top_update_data_float(const struct starpu_top_data *data, double value);
+void starpu_top_task_prevision(struct starpu_task *task, int devid, unsigned long long start, unsigned long long end);
 
 void starpu_top_debug_log(const char *message);
 void starpu_top_debug_lock(const char *message);

+ 28 - 33
mic-configure

@@ -1,12 +1,6 @@
 #!/bin/bash
 
 ROOT_DIR=$PWD
-[ -n "$STARPU_MIC_HOST" ] || STARPU_MIC_HOST=x86_64-k1om-linux
-[ -n "$STARPU_MIC_CC_PATH" ] || STARPU_MIC_CC_PATH=/usr/linux-k1om-4.7/bin/
-[ -n "$STARPU_COI_DIR" ] || STARPU_COI_DIR=/opt/intel/mic/coi
-DEFAULT_PREFIX=/usr/local
-
-export PATH=${STARPU_MIC_CC_PATH}${PATH:+:${PATH}}
 
 cat > ./mic-config.log << EOF
 This file was created by StarPU mic-configure
@@ -14,38 +8,39 @@ This file was created by StarPU mic-configure
  $ $0 $*
 EOF
 
-for arch in mic host
+prefix="/usr/local"
+coi_dir="/opt/intel/mic/coi"
+mic_host="x86_64-k1om-linux"
+
+for arg in $*
 do
+	case $arg in 
+		--prefix=*)
+			prefix="${arg#--prefix=}"
+			;;
+		--with-coi-dir=*)
+			coi_dir="${arg#--with-coi-dir=}"
+			;;
+		--mic-host=*)
+			mic_host="${arg#--mic-host=}"
+			;;
+	esac
+
+done
 
+for arch in mic host
+do
 	# We call the configure script from a build directory further in the
 	# arborescence
-	command="${ROOT_DIR}/configure --enable-mic --with-coi-dir=$STARPU_COI_DIR"
-	prefix_found=no
+
+	command="${ROOT_DIR}/configure"
+	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
 
 	if test x$arch = xmic ; then
-		command="$command --without-hwloc --with-coi-lib-dir=$STARPU_COI_DIR/device-linux-release/lib --host=$STARPU_MIC_HOST"
+		# TODO: fix hwloc detection to look for another pkg-config place, and not just believe in the host version of hwloc.pc...
+		params="$params --without-hwloc --with-coi-lib-dir=$coi_dir/device-linux-release/lib --host=$mic_host"
 	else
-		command="$command --with-coi-lib-dir=$STARPU_COI_DIR/host-linux-release/lib"
-	fi
-
-	for arg in $*
-	do
-		if [ ${arg:0:9} = '--prefix=' ]
-		then
-			prefix_found=yes
-			prefix="${arg:9}"
-			command="$command ${arg}/${arch}"
-		else
-			command="$command $arg"
-		fi
-
-	done
-
-	# If the user didn't specify a directory where to install the library
-	# we apply the default one
-	if test x$prefix_found = xno ; then
-		command="$command --prefix=${DEFAULT_PREFIX}/$arch"
-		prefix=${DEFAULT_PREFIX}
+		params="$params --with-coi-lib-dir=$coi_dir/host-linux-release/lib"
 	fi
 
 	# If the build directory doesn't exist yet, create it
@@ -56,9 +51,9 @@ do
 	cd "build_${arch}"
 
 	if test x$arch = xmic ; then
-		LDFLAGS=-export-dynamic $command
+		LDFLAGS=-export-dynamic $command $* $params
 	else
-		$command
+		$command $* $params
 	fi
 	if [ "$?" != 0 ]
 	then

+ 2 - 2
mpi/starpumpi-1.1.pc.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -25,5 +25,5 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
 Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
+Requires: starpu-1.1
 Requires.private:

+ 2 - 2
mpi/starpumpi-1.2.pc.in

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2011  Université de Bordeaux 1
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 # Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -25,5 +25,5 @@ Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@
 Libs: -L${libdir} -lstarpumpi-@STARPU_EFFECTIVE_VERSION@
 Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
+Requires: starpu-1.2
 Requires.private:

+ 0 - 5
socl/Makefile.am

@@ -17,11 +17,6 @@ SUBDIRS = src examples
 
 EXTRA_DIST = README
 
-libsocl_la_includedir=$(includedir)/starpu/$(STARPU_EFFECTIVE_VERSION)/socl/CL
-
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = socl-1.0.pc socl-1.1.pc
-
 showcheck:
 	for i in $(SUBDIRS) ; do \
 		make -C $$i showcheck ; \

+ 0 - 2
socl/README

@@ -3,5 +3,3 @@ StarPU's OpenCL interface
 
 This directory contains an OpenCL implementation that can
 be used as a replacement of the classic StarPU's API.
-
-OpenCL applications need to be compiled using provided headers.

+ 0 - 29
socl/socl-1.0.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 29
socl/socl-1.1.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 29
socl/socl-1.2.pc.in

@@ -1,29 +0,0 @@
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2009-2011  Université de Bordeaux 1
-# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: socl
-Description: offers OpenCL implementation on top of StarPU
-Version: @PACKAGE_VERSION@
-Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@/socl
-Libs: -L${libdir} -lsocl-@STARPU_EFFECTIVE_VERSION@
-Libs.private: @LDFLAGS@ @LIBS@
-Requires: starpu-1.0
-Requires.private:

+ 0 - 1
socl/vendors/socl.icd.in

@@ -1,2 +1 @@
 @STARPU_BUILD_DIR@/socl/src/.libs/libsocl-@STARPU_EFFECTIVE_VERSION@.so
-

+ 82 - 74
src/datawizard/memalloc.c

@@ -20,22 +20,24 @@
 #include <datawizard/footprint.h>
 #include <starpu.h>
 
-/* This per-node RW-locks protect mc_list and memchunk_cache entries */
-/* Note: handle header lock is always taken before this */
-static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
-
 /* This per-node spinlock protect lru_list */
 static struct _starpu_spinlock lru_rwlock[STARPU_MAXNODES];
 
 /* Last Recently used memory chunkgs */
 static struct _starpu_mem_chunk_lru_list *starpu_lru_list[STARPU_MAXNODES];
 
+
+/* These per-node RW-locks protect mc_list and memchunk_cache entries */
+/* Note: handle header lock is always taken before this */
+static starpu_pthread_rwlock_t mc_rwlock[STARPU_MAXNODES];
+
 /* Potentially in use memory chunks */
 static struct _starpu_mem_chunk_list *mc_list[STARPU_MAXNODES];
 
 /* Explicitly caches memory chunks that can be reused */
 static struct _starpu_mem_chunk_list *memchunk_cache[STARPU_MAXNODES];
 
+
 /* When reclaiming memory to allocate, we reclaim MAX(what_is_to_reclaim_on_device, data_size_coefficient*data_size) */
 const unsigned starpu_memstrategy_data_size_coefficient=2;
 
@@ -73,22 +75,6 @@ void _starpu_deinit_mem_chunk_lists(void)
  *	Manipulate subtrees
  */
 
-static void lock_all_subtree(starpu_data_handle_t handle)
-{
-	unsigned child;
-
-	/* lock parent */
-	while (_starpu_spin_trylock(&handle->header_lock))
-		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-
-	/* lock all sub-subtrees children */
-	for (child = 0; child < handle->nchildren; child++)
-	{
-		starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
-		lock_all_subtree(child_handle);
-	}
-}
-
 static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 	/* lock all sub-subtrees children
@@ -105,6 +91,30 @@ static void unlock_all_subtree(starpu_data_handle_t handle)
 	_starpu_spin_unlock(&handle->header_lock);
 }
 
+static int lock_all_subtree(starpu_data_handle_t handle)
+{
+	int child;
+
+	/* lock parent */
+	if (_starpu_spin_trylock(&handle->header_lock))
+		/* the handle is busy, abort */
+		return 0;
+
+	/* lock all sub-subtrees children */
+	for (child = 0; child < (int) handle->nchildren; child++)
+	{
+		if (!lock_all_subtree(starpu_data_get_child(handle, child))) {
+			/* Some child is busy, abort */
+			while (--child >= 0)
+				/* Unlock what we have already uselessly locked */
+				unlock_all_subtree(starpu_data_get_child(handle, child));
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
 static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 {
 	/* we only free if no one refers to the leaf */
@@ -332,8 +342,9 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	{
 		STARPU_ASSERT(mc->replicate);
 
-		while (_starpu_spin_trylock(&handle->header_lock))
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+		if (_starpu_spin_trylock(&handle->header_lock))
+			/* Handle is busy, abort */
+			return 0;
 
 		if (mc->replicate->refcnt == 0)
 		{
@@ -349,10 +360,8 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	else
 	{
 		/* try to lock all the subtree */
-		lock_all_subtree(handle);
-	      
-		/* check if they are all "free" */
-		if (may_free_subtree(handle, node))
+		/* and check if they are all "free" */
+		if (lock_all_subtree(handle) && may_free_subtree(handle, node))
 		{
 			int target = -1;
 
@@ -381,10 +390,10 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 				/* now the actual buffer may be freed */
 				freed = do_free_mem_chunk(mc, node);
 			}
-		}
 
-		/* unlock the leafs */
-		unlock_all_subtree(handle);
+			/* unlock the tree */
+			unlock_all_subtree(handle);
+		}
 	}
 	return freed;
 }
@@ -439,10 +448,8 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 	STARPU_ASSERT(old_data);
 
 	/* try to lock all the subtree */
-	lock_all_subtree(old_data);
-
-	/* check if they are all "free" */
-	if (may_free_subtree(old_data, node))
+	/* and check if they are all "free" */
+	if (lock_all_subtree(old_data) && may_free_subtree(old_data, node))
 	{
 		success = 1;
 
@@ -452,10 +459,10 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 
 		/* now replace the previous data */
 		reuse_mem_chunk(node, replicate, mc, is_already_in_mc_list);
-	}
 
-	/* unlock the leafs */
-	unlock_all_subtree(old_data);
+		/* unlock the tree */
+		unlock_all_subtree(old_data);
+	}
 
 	return success;
 }
@@ -545,19 +552,27 @@ static unsigned try_to_find_reusable_mem_chunk(unsigned node, starpu_data_handle
 static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 {
 	struct _starpu_mem_chunk *mc;
+	struct _starpu_mem_chunk_list *busy_memchunk_cache;
 
 	size_t freed = 0;
 
+	if (_starpu_mem_chunk_list_empty(memchunk_cache[node]))
+		return 0;
+
+	busy_memchunk_cache = _starpu_mem_chunk_list_new();
+
 	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 	while (!_starpu_mem_chunk_list_empty(memchunk_cache[node])) {
 		mc = _starpu_mem_chunk_list_pop_front(memchunk_cache[node]);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-
 		starpu_data_handle_t handle = mc->data;
 
 		if (handle)
-			while (_starpu_spin_trylock(&handle->header_lock))
-				_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+			if (_starpu_spin_trylock(&handle->header_lock)) {
+				/* The handle is still busy, leave this chunk for later */
+				_starpu_mem_chunk_list_push_front(busy_memchunk_cache, mc);
+				continue;
+			}
+
 		freed += free_memory_on_node(mc, node);
 		if (handle)
 			_starpu_spin_unlock(&handle->header_lock);
@@ -565,10 +580,11 @@ static size_t flush_memchunk_cache(unsigned node, size_t reclaim)
 		free(mc->chunk_interface);
 		_starpu_mem_chunk_delete(mc);
 
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-		if (reclaim && freed>reclaim)
+		if (reclaim && freed >= reclaim)
 			break;
 	}
+	_starpu_mem_chunk_list_push_list_front(busy_memchunk_cache, memchunk_cache[node]);
+	_starpu_mem_chunk_list_delete(busy_memchunk_cache);
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 	return freed;
 }
@@ -583,7 +599,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 {
 	size_t freed = 0;
 
-	struct _starpu_mem_chunk *mc, *next_mc = (void*) -1;
+	struct _starpu_mem_chunk *mc, *next_mc;
 
 	/*
 	 * We have to unlock mc_rwlock before locking header_lock, so we have
@@ -593,50 +609,37 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 	 * finding anything to free.
 	 */
 
-	while (1)
-	{
-		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
-
-		if (_starpu_mem_chunk_list_empty(mc_list[node]) || !next_mc)
-		{
-			STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
-			/* We reached the end of the list :/ */
-			break;
-		}
-
-		if (next_mc == (void*) -1) {
-			/* First iteration ever, start from beginning */
-			mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		} else {
-			/* Try to restart from where we were */
-			for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-			     mc != _starpu_mem_chunk_list_end(mc_list[node]);
-			     mc = _starpu_mem_chunk_list_next(mc))
-				if (mc == next_mc)
-					/* Found it, restart from there.  */
-					break;
-
-			if (mc == _starpu_mem_chunk_list_end(mc_list[node]))
-				/* Couldn't find next_mc, restart from the beginning :/ */
-				mc = _starpu_mem_chunk_list_begin(mc_list[node]);
-		}
+restart:
+	STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 
-		/* Remember where to try next */
+	for (mc = _starpu_mem_chunk_list_begin(mc_list[node]);
+	     mc != _starpu_mem_chunk_list_end(mc_list[node]);
+	     mc = next_mc)
+	{
+		/* mc hopefully gets out of the list, we thus need to prefetch
+		 * the next element */
 		next_mc = _starpu_mem_chunk_list_next(mc);
-		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 		if (!force)
 		{
 			freed += try_to_free_mem_chunk(mc, node);
 
-			if (reclaim && freed > reclaim)
+			if (reclaim && freed >= reclaim)
 				break;
 		}
 		else
 		{
 			starpu_data_handle_t handle = mc->data;
 
-			_starpu_spin_lock(&handle->header_lock);
+			if (_starpu_spin_trylock(&handle->header_lock))
+			{
+				/* Ergl. We are shutting down, but somebody is
+				 * still locking the handle. That's not
+				 * supposed to happen, but better be safe by
+				 * letting it go through. */
+				STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+				goto restart;
+			}
 
 			/* We must free the memory now, because we are
 			 * terminating the drivers: note that data coherency is
@@ -646,6 +649,7 @@ static size_t free_potentially_in_use_mc(unsigned node, unsigned force, size_t r
 			_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
+	STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
 
 	return freed;
 }
@@ -768,8 +772,12 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 		_starpu_mem_chunk_delete(mc);
 	}
 	else
+	{
 		/* put it in the list of buffers to be removed */
+		STARPU_PTHREAD_RWLOCK_WRLOCK(&mc_rwlock[node]);
 		_starpu_mem_chunk_list_push_front(memchunk_cache[node], mc);
+		STARPU_PTHREAD_RWLOCK_UNLOCK(&mc_rwlock[node]);
+	}
 }
 
 /*

+ 2 - 1
src/sched_policies/parallel_eager.c

@@ -179,7 +179,8 @@ static int push_task_peager_policy(struct starpu_task *task)
 		worker = workers->get_next(workers, &it);
 		int master = data->master_id[worker];
 		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if (starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
+		if (!starpu_worker_is_combined_worker(worker) &&
+				starpu_worker_get_type(worker) != STARPU_CPU_WORKER  || master == worker)
 		{
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;