|
@@ -93,8 +93,8 @@ starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx,
|
|
|
|
|
|
3D matrices can be partitioned along the x dimension by
|
|
|
using starpu_block_filter_block(), or along the y dimension
|
|
|
-by using starpu_block_filter_vertical_block, or along the
|
|
|
-z dimension by using starpu_block_filter_depth_block. They
|
|
|
+by using starpu_block_filter_vertical_block(), or along the
|
|
|
+z dimension by using starpu_block_filter_depth_block(). They
|
|
|
can also be partitioned with some overlapping by using
|
|
|
starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
|
|
|
or starpu_block_filter_depth_block_shadow().
|
|
@@ -206,23 +206,23 @@ need to be set within the original allocation. To reserve room for increasing
|
|
|
the NX/NY values, one can use starpu_matrix_data_register_allocsize() instead of
|
|
|
starpu_matrix_data_register(), to specify the allocation size to be used instead
|
|
|
of the default NX*NY*ELEMSIZE. To support this, the data interface
|
|
|
-has to implement the starpu_data_interface_ops::alloc_footprint and
|
|
|
-starpu_data_interface_ops::alloc_compare methods, for proper StarPU allocation
|
|
|
+has to implement the functions starpu_data_interface_ops::alloc_footprint and
|
|
|
+starpu_data_interface_ops::alloc_compare, for proper StarPU allocation
|
|
|
management.
|
|
|
|
|
|
A more involved case is changing the amount of allocated data.
|
|
|
The task implementation can just reallocate the buffer during its execution, and
|
|
|
set the proper new values in the interface structure, e.g. nx, ny, ld, etc. so
|
|
|
-that the StarPU core knows the new data layout. The starpu_data_interface_ops
|
|
|
-structure however then needs to have the starpu_data_interface_ops::dontcache
|
|
|
-field set to 1, to prevent StarPU from trying to perform any cached allocation,
|
|
|
+that the StarPU core knows the new data layout. The structure starpu_data_interface_ops
|
|
|
+however then needs to have the field starpu_data_interface_ops::dontcache
|
|
|
+set to 1, to prevent StarPU from trying to perform any cached allocation,
|
|
|
since the allocated size will vary. An example is available in
|
|
|
<c>tests/datawizard/variable_size.c</c>. The example uses its own data
|
|
|
interface so as to contain some simulation information for data growth, but the
|
|
|
principle can be applied for any data interface.
|
|
|
|
|
|
-The principle is to use <c>starpu_malloc_on_node_flags</c> to make the new
|
|
|
-allocation, and use <c>starpu_free_on_node_flags</c> to release any previous
|
|
|
+The principle is to use starpu_malloc_on_node_flags() to make the new
|
|
|
+allocation, and use starpu_free_on_node_flags() to release any previous
|
|
|
allocation. The flags have to be precisely like in the example:
|
|
|
|
|
|
\code{.c}
|
|
@@ -244,7 +244,7 @@ stored in the data and the actually allocated buffer. For instance, the vector
|
|
|
interface uses the <c>nx</c> field for the former, and the <c>allocsize</c> for
|
|
|
the latter. This allows for lazy reallocation to avoid reallocating the buffer
|
|
|
every time to exactly match the actual number of elements. Computations and data
|
|
|
-transfers will use <c>nx</c> field, while allocation functions will use the
|
|
|
+transfers will use the field <c>nx</c>, while allocation functions will use the field
|
|
|
<c>allocsize</c>. One just has to make sure that <c>allocsize</c> is always
|
|
|
bigger or equal to <c>nx</c>.
|
|
|
|
|
@@ -254,12 +254,12 @@ Important note: one can not change the size of a partitioned data.
|
|
|
\section DataManagement Data Management
|
|
|
|
|
|
When the application allocates data, whenever possible it should use
|
|
|
-the starpu_malloc() function, which will ask CUDA or OpenCL to make
|
|
|
-the allocation itself and pin the corresponding allocated memory, or to use the
|
|
|
-starpu_memory_pin() function to pin memory allocated by other ways, such as local arrays. This
|
|
|
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
|
|
|
+the allocation itself and pin the corresponding allocated memory, or use the function
|
|
|
+starpu_memory_pin() to pin memory allocated by other ways, such as local arrays. This
|
|
|
is needed to permit asynchronous data transfer, i.e. permit data
|
|
|
transfer to overlap with computations. Otherwise, the trace will show
|
|
|
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
|
|
|
+that the state <c>DriverCopyAsync</c> takes a lot of time, this is
|
|
|
because CUDA or OpenCL then reverts to synchronous transfers.
|
|
|
|
|
|
The application can provide its own allocation function by calling
|
|
@@ -351,8 +351,8 @@ before submitting tasks, which will thus guide StarPU toward an initial task
|
|
|
distribution (since StarPU will try to avoid further transfers).
|
|
|
|
|
|
This can be achieved by giving the function starpu_data_prefetch_on_node() the
|
|
|
-handle and the desired target memory node. The
|
|
|
-starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
|
|
|
+handle and the desired target memory node. The variant
|
|
|
+starpu_data_idle_prefetch_on_node() can be used to issue the transfer
|
|
|
only when the bus is idle.
|
|
|
|
|
|
Conversely, one can advise StarPU that some data will not be useful in the
|
|
@@ -434,8 +434,8 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
|
|
|
}
|
|
|
\endcode
|
|
|
|
|
|
-When the sub-data is not of the same type as the original data, the
|
|
|
-starpu_data_filter::get_child_ops field needs to be set appropriately for StarPU
|
|
|
+When the sub-data is not of the same type as the original data, the field
|
|
|
+starpu_data_filter::get_child_ops needs to be set appropriately for StarPU
|
|
|
to know which type should be used.
|
|
|
|
|
|
StarPU provides various interfaces and filters for matrices, vectors, etc.,
|
|
@@ -476,9 +476,8 @@ starpu_data_partition_plan(handle, &f_vert, vert_handle);
|
|
|
|
|
|
starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.
|
|
|
|
|
|
-One can then submit tasks working on the main handle, and tasks working on
|
|
|
-<c>vert_handle</c> handles. Between using the main handle and <c>vert_handle</c>
|
|
|
-handles, StarPU will automatically call starpu_data_partition_submit() and
|
|
|
+One can then submit tasks working on the main handle, and tasks working on the handles
|
|
|
+<c>vert_handle</c>. Between using the main handle and the handles <c>vert_handle</c>, StarPU will automatically call starpu_data_partition_submit() and
|
|
|
starpu_data_unpartition_submit().
|
|
|
|
|
|
All this code is asynchronous, just submitting which tasks, partitioning and
|
|
@@ -544,7 +543,7 @@ ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
|
|
|
0);
|
|
|
\endcode
|
|
|
|
|
|
-The execution of the <c>switch</c> task will get back the matrix data into the
|
|
|
+The execution of the task <c>switch</c> will get back the matrix data into the
|
|
|
main memory, and thus the vertical slices will get the updated value there.
|
|
|
|
|
|
Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:
|
|
@@ -557,7 +556,7 @@ And now we can start using vertical slices, etc.
|
|
|
|
|
|
\section DataPointers Handles data buffer pointers
|
|
|
|
|
|
-A simple understanding of starpu handles is that it's a collection of buffers on
|
|
|
+A simple understanding of StarPU handles is that it's a collection of buffers on
|
|
|
each memory node of the machine, which contain the same data. The picture is
|
|
|
however made more complex with the OpenCL support and with partitioning.
|
|
|
|
|
@@ -565,21 +564,28 @@ When partitioning a handle, the data buffers of the subhandles will indeed
|
|
|
be inside the data buffers of the main handle (to save transferring data
|
|
|
back and forth between the main handle and the subhandles). But in OpenCL,
|
|
|
a <c>cl_mem</c> is not a pointer, but an opaque value on which pointer
|
|
|
-arithmetic can not be used. That is why data interfaces contain three members:
|
|
|
-<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>. The <c>dev_handle</c> member
|
|
|
-is what the allocation function returned, and one can not do arithmetic on
|
|
|
-it. The <c>offset</c> member is the offset inside the allocated area, most often
|
|
|
-it will be 0 because data start at the beginning of the allocated area, but
|
|
|
-when the handle is partitioned, the subhandles will have varying <c>offset</c>
|
|
|
-values, for each subpiece. The <c>ptr</c> member, in the non-OpenCL case, i.e.
|
|
|
-when pointer arithmetic can be used on <c>dev_handle</c>, is just the sum of
|
|
|
+arithmetic can not be used. That is why data interfaces contain three fields:
|
|
|
+<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>.
|
|
|
+<ul>
|
|
|
+<li> The field <c>dev_handle</c> is what the allocation function
|
|
|
+returned, and one can not do arithmetic on it.
|
|
|
+</li>
|
|
|
+<li> The field <c>offset</c> is the offset inside the allocated area,
|
|
|
+most often it will be 0 because data start at the beginning of the
|
|
|
+allocated area, but when the handle is partitioned, the subhandles
|
|
|
+will have varying <c>offset</c> values, for each subpiece.
|
|
|
+</li>
|
|
|
+<li> The field <c>ptr</c>, in the non-OpenCL case, i.e. when pointer
|
|
|
+arithmetic can be used on <c>dev_handle</c>, is just the sum of
|
|
|
<c>dev_handle</c> and <c>offset</c>, provided for convenience.
|
|
|
+</li>
|
|
|
+</ul>
|
|
|
|
|
|
This means that:
|
|
|
<ul>
|
|
|
<li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li>
|
|
|
<li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li>
|
|
|
-<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
|
|
|
+<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node() in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
|
|
|
<li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li>
|
|
|
</ul>
|
|
|
|
|
@@ -589,8 +595,8 @@ StarPU provides a series of predefined filters in \ref API_Data_Partition, but
|
|
|
additional filters can be defined by the application. The principle is that the
|
|
|
filter function just fills the memory location of the <c>i-th</c> subpart of a data.
|
|
|
Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
|
|
|
-and see \ref starpu_data_filter::filter_func for the details.
|
|
|
-The starpu_filter_nparts_compute_chunk_size_and_offset() helper can be used to
|
|
|
+check \ref starpu_data_filter::filter_func for further details.
|
|
|
+The helper function starpu_filter_nparts_compute_chunk_size_and_offset() can be used to
|
|
|
compute the division of pieces of data.
|
|
|
|
|
|
\section DataReduction Data Reduction
|
|
@@ -709,7 +715,7 @@ of submission. In some applicative cases, the write contributions can actually
|
|
|
be performed in any order without affecting the eventual result. In this case
|
|
|
it is useful to drop the strictly sequential semantic, to improve parallelism
|
|
|
by allowing StarPU to reorder the write accesses. This can be done by using
|
|
|
-the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
|
|
|
+the data access flag ::STARPU_COMMUTE. Accesses without this flag will however
|
|
|
properly be serialized against accesses with this flag. For instance:
|
|
|
|
|
|
\code{.c}
|
|
@@ -739,7 +745,7 @@ already serialized, and thus by default StarPU uses the Dijkstra solution which
|
|
|
scales very well in terms of overhead: tasks will just acquire data one by one
|
|
|
by data handle pointer value order.
|
|
|
|
|
|
-When sequential ordering is disabled or the ::STARPU_COMMUTE flag is used, there
|
|
|
+When sequential ordering is disabled or the flag ::STARPU_COMMUTE is used, there
|
|
|
may be a lot of concurrent accesses to the same data, and the Dijkstra solution
|
|
|
gets only poor parallelism, typically in some pathological cases which do happen
|
|
|
in various applications. In this case, one can use a data access arbiter, which
|
|
@@ -752,7 +758,7 @@ will acquire them arbiter by arbiter, in arbiter pointer value order.
|
|
|
|
|
|
See the <c>tests/datawizard/test_arbiter.cpp</c> example.
|
|
|
|
|
|
-Arbiters however do not support the ::STARPU_REDUX flag yet.
|
|
|
+Arbiters however do not support the flag ::STARPU_REDUX yet.
|
|
|
|
|
|
\section TemporaryBuffers Temporary Buffers
|
|
|
|
|
@@ -885,7 +891,7 @@ struct starpu_complex_interface
|
|
|
That structure stores enough to describe <b>one</b> buffer of such kind of
|
|
|
data. It is used for the buffer stored in the main memory, another instance
|
|
|
is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
|
|
|
-collection of such structures, to remember each buffer on each memory node.
|
|
|
+collection of such structures, to describe each buffer on each memory node.
|
|
|
|
|
|
Note: one should not take pointers into such structures, because StarPU needs
|
|
|
to be able to copy over the content of it to various places, for instance to
|
|
@@ -921,8 +927,8 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr,
|
|
|
The <c>struct starpu_complex_interface complex</c> is here used just to store the
|
|
|
parameters that the user provided to <c>starpu_complex_data_register</c>.
|
|
|
starpu_data_register() will first allocate the handle, and
|
|
|
-then pass the <c>starpu_complex_interface</c> structure to the
|
|
|
-starpu_data_interface_ops::register_data_handle method, which records them
|
|
|
+then pass the structure <c>starpu_complex_interface</c> to the method
|
|
|
+starpu_data_interface_ops::register_data_handle, which records them
|
|
|
within the data handle (it is called once per node by starpu_data_register()):
|
|
|
|
|
|
\code{.c}
|
|
@@ -975,7 +981,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
|
|
|
\endcode
|
|
|
|
|
|
Convenience functions can be defined to access the different fields of the
|
|
|
-complex interface from a StarPU data handle after a starpu_data_acquire() call:
|
|
|
+complex interface from a StarPU data handle after a call to starpu_data_acquire():
|
|
|
|
|
|
\code{.c}
|
|
|
double *starpu_complex_get_real(starpu_data_handle_t handle)
|
|
@@ -1029,7 +1035,7 @@ directory <c>examples/interface/</c>.
|
|
|
|
|
|
To be able to run tasks on GPUs etc. StarPU needs to know how to allocate a
|
|
|
buffer for the interface. In our example, two allocations are needed in the
|
|
|
-allocation complex_allocate_data_on_node() method: one for the real part and one
|
|
|
+allocation method \c complex_allocate_data_on_node(): one for the real part and one
|
|
|
for the imaginary part.
|
|
|
|
|
|
\code{.c}
|
|
@@ -1062,10 +1068,10 @@ fail_real:
|
|
|
\endcode
|
|
|
|
|
|
Here we try to allocate the two parts. If either of them fails, we return
|
|
|
--ENOMEM. If they succeed, we can record the obtained pointers and returned the
|
|
|
+\c -ENOMEM. If they succeed, we can record the obtained pointers and return the
|
|
|
amount of allocated memory (for memory usage accounting).
|
|
|
|
|
|
-Conversely, complex_free_data_on_node() frees the two parts:
|
|
|
+Conversely, \c complex_free_data_on_node() frees the two parts:
|
|
|
|
|
|
\code{.c}
|
|
|
static void complex_free_data_on_node(void *data_interface, unsigned node)
|
|
@@ -1085,7 +1091,7 @@ returns the resulting pointer, be it in main memory, in GPU memory, etc.
|
|
|
\subsection DefiningANewDataInterface_copy Data copy
|
|
|
|
|
|
Now that StarPU knows how to allocate/free a buffer, it needs to be able to
|
|
|
-copy over data into/from it. Defining a copy_any_to_any method allows StarPU to
|
|
|
+copy over data into/from it. Defining a method \c copy_any_to_any() allows StarPU to
|
|
|
perform direct transfers between main memory and GPU memory.
|
|
|
|
|
|
\code{.c}
|
|
@@ -1115,10 +1121,10 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
|
|
|
We here again have no idea what is main memory or GPU memory, or even if the
|
|
|
copy is synchronous or asynchronous: we just call starpu_interface_copy()
|
|
|
according to the interface, passing it the pointers, and checking whether it
|
|
|
-returned -EAGAIN, which means the copy is asynchronous, and StarPU will
|
|
|
-appropriately wait for it thanks to the \c async_data pointer.
|
|
|
+returned \c -EAGAIN, which means the copy is asynchronous, and StarPU will
|
|
|
+appropriately wait for it thanks to the pointer \c async_data.
|
|
|
|
|
|
-This copy method is referenced in a \ref starpu_data_copy_methods structure:
|
|
|
+This copy method is referenced in a structure \ref starpu_data_copy_methods:
|
|
|
|
|
|
\code{.c}
|
|
|
static const struct starpu_data_copy_methods complex_copy_methods =
|
|
@@ -1127,7 +1133,7 @@ static const struct starpu_data_copy_methods complex_copy_methods =
|
|
|
};
|
|
|
\endcode
|
|
|
|
|
|
-which was referenced in the \ref starpu_data_interface_ops structure above.
|
|
|
+which was referenced in the structure \ref starpu_data_interface_ops above.
|
|
|
|
|
|
Other fields of \ref starpu_data_copy_methods allow to provide optimized
|
|
|
variants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.
|
|
@@ -1136,7 +1142,7 @@ variants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.
|
|
|
|
|
|
The copy methods allow for RAM/GPU transfers, but are not enough for e.g.
|
|
|
transferring over MPI. That requires defining the pack/peek/unpack methods. The
|
|
|
-principle is that the starpu_data_interface_ops::pack_data method concatenates
|
|
|
+principle is that the method starpu_data_interface_ops::pack_data concatenates
|
|
|
the buffer data into a newly-allocated contiguous bytes array, conversely
|
|
|
starpu_data_interface_ops::peek_data extracts from a bytes array into the
|
|
|
buffer data, and starpu_data_interface_ops::unpack_data does the same as
|
|
@@ -1164,7 +1170,7 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
|
|
|
}
|
|
|
\endcode
|
|
|
|
|
|
-complex_pack_data() first computes the size to be allocated, then allocates it,
|
|
|
+\c complex_pack_data() first computes the size to be allocated, then allocates it,
|
|
|
and copies over into it the content of the two real and imaginary arrays.
|
|
|
|
|
|
\code{.c}
|
|
@@ -1184,7 +1190,7 @@ static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *p
|
|
|
}
|
|
|
\endcode
|
|
|
|
|
|
-complex_peek_data() simply uses memcpy to copy over from the bytes array into the data buffer.
|
|
|
+\c complex_peek_data() simply uses \c memcpy() to copy over from the bytes array into the data buffer.
|
|
|
|
|
|
\code{.c}
|
|
|
static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
|
|
@@ -1197,7 +1203,7 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
|
|
|
}
|
|
|
\endcode
|
|
|
|
|
|
-And complex_unpack_data() just calls complex_peek_data() and releases the bytes array.
|
|
|
+And \c complex_unpack_data() just calls \c complex_peek_data() and releases the bytes array.
|
|
|
|
|
|
|
|
|
\section SpecifyingATargetNode Specifying A Target Node For Task Data
|
|
@@ -1206,8 +1212,8 @@ When executing a task on a GPU for instance, StarPU would normally copy all the
|
|
|
needed data for the tasks on the embedded memory of the GPU. It may however
|
|
|
happen that the task kernel would rather have some of the data kept in the
|
|
|
main memory instead of copied in the GPU, a pivoting vector for instance.
|
|
|
-This can be achieved by setting the starpu_codelet::specific_nodes flag to
|
|
|
-<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
|
|
|
+This can be achieved by setting the flag starpu_codelet::specific_nodes to
|
|
|
+<c>1</c>, and then fill the array starpu_codelet::nodes (or starpu_codelet::dyn_nodes when
|
|
|
starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
|
|
|
where data should be copied to, or ::STARPU_SPECIFIC_NODE_LOCAL to let
|
|
|
StarPU copy it to the memory node where the task will be executed.
|