@@ -1,6 +1,6 @@
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
- * Copyright (C) 2010-2017 CNRS
+ * Copyright (C) 2010-2018 CNRS
 * Copyright (C) 2009-2011,2014-2018 Université de Bordeaux
 * Copyright (C) 2011-2012 Inria
 *
@@ -160,7 +160,7 @@ because CUDA or OpenCL then reverts to synchronous transfers.
By default, StarPU leaves replicates of data wherever they were used, in case they
will be re-used by other tasks, thus saving the data transfer time. When some
task modifies some data, all the other replicates are invalidated, and only the
-processing unit which ran that task will have a valid replicate of the data. If the application knows
+processing unit which ran this task will have a valid replicate of the data. If the application knows
that this data will not be re-used by further tasks, it should advise StarPU to
immediately replicate it to a desired list of memory nodes (given through a
bitmask). This can be understood like the write-through mode of CPU caches.
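
For illustration, a minimal sketch of such a write-through hint through
starpu_data_set_wt_mask(), assuming a previously registered <c>handle</c> and
that memory node 0 is the desired destination:

\code{.c}
/* Ask StarPU to push a fresh copy of "handle" back to memory node 0 whenever
 * a task updates it, similarly to a write-through cache. */
starpu_data_set_wt_mask(handle, 1 << 0);
\endcode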
@@ -188,7 +188,7 @@ of tasks access the same piece of data. If no dependency is required
on some piece of data (e.g. because it is only accessed in read-only
mode, or because write accesses are actually commutative), use the
function starpu_data_set_sequential_consistency_flag() to disable
-implicit dependencies on that data.
+implicit dependencies on this data.
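
As a rough illustration (the <c>handle</c> and the codelets <c>cl1</c> and
<c>cl2</c> are assumptions), disabling the implicit dependencies for one piece
of data could look like:

\code{.c}
/* The tasks below all access "handle", but no implicit dependency should be
 * enforced between them (e.g. the accesses are read-only or commute). */
starpu_data_set_sequential_consistency_flag(handle, 0);
starpu_task_insert(&cl1, STARPU_R, handle, 0);
starpu_task_insert(&cl2, STARPU_R, handle, 0); /* may run concurrently with cl1 */
\endcode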

In the same vein, accumulation of results in the same data can become a
bottleneck. The use of the mode ::STARPU_REDUX makes it possible to optimize such
@@ -461,7 +461,7 @@ whole machine, it would not be efficient to accumulate them in only one place,
incurring a data transfer for each contribution as well as access concurrency.

StarPU provides a mode ::STARPU_REDUX, which makes it possible to optimize
-that case: it will allocate a buffer on each memory node, and accumulate
+this case: it will allocate a buffer on each memory node, and accumulate
intermediate results there. When the data is eventually accessed in the normal
mode ::STARPU_R, StarPU will collect the intermediate results in just one
buffer.
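
A hedged sketch of such a reduction setup (the handle <c>res_handle</c>, the
loop bounds, and the codelets <c>init_cl</c>, <c>redux_cl</c>, <c>accum_cl</c>
and <c>use_cl</c> are assumptions; <c>init_cl</c> writes the neutral element
and <c>redux_cl</c> combines two contributions):

\code{.c}
/* Declare how the per-node buffers are initialized and merged. */
starpu_data_set_reduction_methods(res_handle, &redux_cl, &init_cl);

/* Many tasks accumulate into res_handle without serializing on it. */
for (i = 0; i < ntasks; i++)
	starpu_task_insert(&accum_cl, STARPU_REDUX, res_handle, 0);

/* The next STARPU_R access makes StarPU collect the partial results. */
starpu_task_insert(&use_cl, STARPU_R, res_handle, 0);
\endcode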
@@ -542,9 +542,9 @@ The example <c>cg</c> also uses reduction for the blocked gemv kernel,
leading to yet more relaxed dependencies and more parallelism.

::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
-case. That will however not produce any MPI communication, but just pass
+case. This will however not produce any MPI communication, but just pass
::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
-application to call starpu_mpi_redux_data(), which posts tasks that will
+application to call starpu_mpi_redux_data(), which posts tasks which will
reduce the partial results among MPI nodes into the MPI node which owns the
data. For instance, some hypothetical application which collects partial results
into data <c>res</c>, then uses it for other computation, before looping again
@@ -566,7 +566,7 @@ for (i = 0; i < 100; i++)
By default, the implicit dependencies computed from data access use the
sequential semantic. Notably, write accesses are always serialized in the order
of submission. In some applications, the write contributions can actually
-be performed in any order without affecting the eventual result. In that case
+be performed in any order without affecting the eventual result. In this case
it is useful to drop the strictly sequential semantic, to improve parallelism
by allowing StarPU to reorder the write accesses. This can be done by using
the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
@@ -614,7 +614,7 @@ by data handle pointer value order.
When sequential ordering is disabled or the ::STARPU_COMMUTE flag is used, there
may be a lot of concurrent accesses to the same data, and the Dijkstra solution
achieves only poor parallelism, typically in some pathological cases which do happen
-in various applications. In that case, one can use a data access arbiter, which
+in various applications. In this case, one can use a data access arbiter, which
implements the classical centralized solution for the Dining Philosophers
problem. This is more expensive in terms of overhead since it is centralized,
but it opportunistically gets a lot of parallelism. The centralization can also
@@ -641,7 +641,7 @@ the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
actually allocate memory only when the task creating the content gets scheduled,
and destroy it on unregistration.
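
For instance, a temporary vector whose content only ever lives where tasks
produce and consume it might be declared along these lines (a sketch; the
element count <c>n</c> and the codelets <c>produce_cl</c> and <c>consume_cl</c>
are assumptions):

\code{.c}
starpu_data_handle_t tmp_handle;
/* Memory node -1 and a zero pointer: no initial content, StarPU allocates the
 * buffer lazily on the node where the producing task gets scheduled. */
starpu_vector_data_register(&tmp_handle, -1, (uintptr_t) NULL, n, sizeof(float));
starpu_task_insert(&produce_cl, STARPU_W, tmp_handle, 0);
starpu_task_insert(&consume_cl, STARPU_R, tmp_handle, 0);
/* Lazy unregistration, as described below. */
starpu_data_unregister_submit(tmp_handle);
\endcode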

-In addition to that, it can be tedious for the application to have to unregister
+In addition to this, it can be tedious for the application to have to unregister
the data, since it will not use its content anyway. The unregistration can be
done lazily by using the function starpu_data_unregister_submit(),
which will record that no more tasks accessing the handle will be submitted, so
@@ -668,9 +668,9 @@ codelet is needed).

Some kernels sometimes need temporary data to perform their computations, i.e. a
workspace. The application could allocate it at the start of the codelet
-function, and free it at the end, but that would be costly. It could also
+function, and free it at the end, but this would be costly. It could also
allocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),
-but that would
+but this would
make them systematic and permanent. A more optimized way is to use
the data access mode ::STARPU_SCRATCH, as exemplified below, which
provides per-worker buffers without content consistency. The buffer is
@@ -697,8 +697,8 @@ The example <c>examples/pi</c> uses scratches for some temporary buffer.
\section TheMultiformatInterface The Multiformat Interface

It may be interesting to represent the same piece of data using two different
-data structures: one that would only be used on CPUs, and one that would only
-be used on GPUs. This can be done by using the multiformat interface. StarPU
+data structures: one only used on CPUs, and one only used on GPUs.
+This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the scheduler <c>dmda</c> is the only one optimized for this
interface. The user must provide StarPU with conversion codelets: