@@ -28,8 +28,9 @@ will show roughly where time is spent, and focus correspondingly.

\section CheckTaskSize Check Task Size

-Make sure that your tasks are not too small, because the StarPU runtime overhead
-is not completely zero. You can run the tasks_size_overhead.sh script to get an
+Make sure that your tasks are not too small, as the StarPU runtime overhead
+is not completely zero. As explained in \ref TaskSizeOverhead, you can
+run the script \c tasks_size_overhead.sh to get an
idea of the scalability of tasks depending on their duration (in µs), on your
own system.
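+
+For instance (assuming the directory containing the script is in your \c PATH):
+
+\verbatim
+$ tasks_size_overhead.sh
+\endverbatim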

@@ -40,19 +41,18 @@ much bigger than this.
of cores, so it's better to try to get 10ms-ish tasks.

Tasks durations can easily be observed when performance models are defined (see
-\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
-<c>starpu_perfmodel_display</c> tool (see \ref PerformanceOfCodelets)
+\ref PerformanceModelExample) by using the tools <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> (see \ref PerformanceOfCodelets).

When using parallel tasks, the problem is even worse since StarPU has to
-synchronize the execution of tasks.
+synchronize the tasks' execution.

\section ConfigurationImprovePerformance Configuration Which May Improve Performance

-The \ref enable-fast "--enable-fast" \c configure option disables all
+The \c configure option \ref enable-fast "--enable-fast" disables all
assertions. This makes StarPU more performant for really small tasks by
disabling all sanity checks. Only use this for measurements and production, not for development, since this will drop all basic checks.
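+
+For instance, a production build could be configured with:
+
+\code
+./configure --enable-fast
+\endcode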

-
\section DataRelatedFeaturesToImprovePerformance Data Related Features Which May Improve Performance

link to \ref DataManagement
@@ -81,14 +81,14 @@ link to \ref StaticScheduling

For proper overlapping of asynchronous GPU data transfers, data has to be pinned
by CUDA. Data allocated with starpu_malloc() is always properly pinned. If the
-application is registering to StarPU some data which has not been allocated with
-starpu_malloc(), it should use starpu_memory_pin() to pin it.
+application registers to StarPU some data which has not been allocated with
+starpu_malloc(), starpu_memory_pin() should be called to pin the data memory.
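+
+A minimal sketch (the buffer size \c N and the vector registration are
+illustrative):
+
+\code{.c}
+starpu_data_handle_t handle;
+float *buffer = malloc(N * sizeof(float));
+/* Pin the buffer so that asynchronous CUDA transfers can overlap */
+starpu_memory_pin(buffer, N * sizeof(float));
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)buffer, N, sizeof(float));
+\endcode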

Due to CUDA limitations, StarPU will have a hard time overlapping its own
communications and the codelet computations if the application does not use a
dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
+which synchronizes all operations of the GPU. The function
+starpu_cuda_get_local_stream() returns a stream which can be used by all CUDA codelet
operations to avoid this issue. For instance:

\code{.c}
@@ -105,11 +105,11 @@ If some CUDA calls are made without specifying this local stream,
synchronization needs to be made explicit with cudaThreadSynchronize() around these
calls, to make sure that they get properly synchronized with the calls using
the local stream. Notably, \c cudaMemcpy() and \c cudaMemset() are actually
-asynchronous and need such explicit synchronization! Use cudaMemcpyAsync() and
-cudaMemsetAsync() instead.
+asynchronous and need such explicit synchronization! Use \c cudaMemcpyAsync() and
+\c cudaMemsetAsync() instead.
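+
+A sketch (\c dst, \c src and \c size are illustrative):
+
+\code{.c}
+cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice,
+                starpu_cuda_get_local_stream());
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+\endcode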

-Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
+Calling starpu_cublas_init() makes StarPU perform the appropriate calls for the
+CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
one then has to call <c>cublasSetKernelStream(</c>starpu_cuda_get_local_stream()<c>)</c> at
the beginning of the codelet to make sure that CUBLAS is really using the proper
stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
@@ -147,14 +147,14 @@ triggered by the completion of the kernel.
Using the flag ::STARPU_CUDA_ASYNC also makes it possible to enable concurrent kernel
execution, on cards which support it (Kepler and later, notably). This is
enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
-number of kernels to execute concurrently. This is useful when kernels are
+number of kernels to be executed concurrently. This is useful when kernels are
small and do not feed the whole GPU with threads to run.
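+
+For instance, to let two small kernels run concurrently on each CUDA device:
+
+<c>export STARPU_NWORKER_PER_CUDA=2</c>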

-Concerning memory allocation, you should really not use \c cudaMalloc/ \c cudaFree
-within the kernel, since \c cudaFree introduces a awfully lot of synchronizations
+Concerning memory allocation, you should really not use \c cudaMalloc()/\c cudaFree()
+within the kernel, since \c cudaFree() introduces an awful lot of synchronizations
within CUDA itself. You should instead add a parameter to the codelet with the
::STARPU_SCRATCH mode access. You can then pass to the task a handle registered
-with the desired size but with the \c NULL pointer, that handle can even be the
+with the desired size but with the \c NULL pointer; the handle can even be
shared between tasks: StarPU will allocate per-task data on the fly before task
execution, and reuse the allocated data between tasks.
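+
+A minimal sketch (the codelet \c my_cl and the vector size are illustrative):
+
+\code{.c}
+starpu_data_handle_t scratch;
+/* Register with the desired size, but a NULL pointer and no home node */
+starpu_vector_data_register(&scratch, -1, (uintptr_t)NULL, 1024, sizeof(float));
+
+struct starpu_task *task = starpu_task_create();
+task->cl = &my_cl; /* codelet declaring my_cl.modes[0] = STARPU_SCRATCH */
+task->handles[0] = scratch;
+starpu_task_submit(task);
+\endcode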

@@ -177,8 +177,8 @@ kernel startup and completion.

It may happen that for some reason, StarPU does not make progress for a long
period of time. This is sometimes due to contention inside StarPU, but
-sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
-driver, etc.
+sometimes this is due to external reasons, such as a stuck MPI or CUDA
+driver.

<c>export STARPU_WATCHDOG_TIMEOUT=10000</c> (\ref STARPU_WATCHDOG_TIMEOUT)

@@ -187,30 +187,34 @@ any task for 10ms, but lets the application continue normally. In addition to th

<c>export STARPU_WATCHDOG_CRASH=1</c> (\ref STARPU_WATCHDOG_CRASH)

-raises <c>SIGABRT</c> in this condition, thus allowing to catch the situation in gdb.
+raises <c>SIGABRT</c> in this condition, thus making it possible to catch the
+situation in \c gdb.
+
It can also be useful to type <c>handle SIGABRT nopass</c> in <c>gdb</c> to be able to let
the process continue, after inspecting the state of the process.

\section HowToLimitMemoryPerNode How to Limit Memory Used By StarPU And Cache Buffer Allocations

By default, StarPU makes sure to use at most 90% of the memory of GPU devices,
-moving data in and out of the device as appropriate and with prefetch and
-writeback optimizations. Concerning the main memory, by default it will not
-limit its consumption, since by default it has nowhere to push the data to when
-memory gets tight. This also means that by default StarPU will not cache buffer
-allocations in main memory, since it does not know how much of the system memory
-it can afford.
-
-In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
-\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM environment variables
-can be used to control how
-much (in MiB) of the GPU device memory should be used at most by StarPU (their
-default values are 90% of the available memory).
-
-In the case of the main memory, the \ref STARPU_LIMIT_CPU_MEM environment
-variable can be used to specify how much (in MiB) of the main memory should be
-used at most by StarPU for buffer allocations. This way, StarPU will be able to
-cache buffer allocations (which can be a real benefit if a lot of bufferes are
+moving data in and out of the device as appropriate, as well as using
+prefetch and writeback optimizations.
+
+The environment variables \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM
+can be used to control how much (in MiB) of the GPU device memory
+should be used at most by StarPU (the default value is to use 90% of the
+available memory).
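+
+For example, to let StarPU use at most 2 GiB on each CUDA device:
+
+<c>export STARPU_LIMIT_CUDA_MEM=2048</c>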
+
+By default, the usage of the main memory is not limited, as the
+default mechanisms provide no means to evict data from main memory when it
+gets too tight. This also means that by default StarPU will not cache buffer
+allocations in main memory, since it does not know how much of the
+system memory it can afford.
+
+The environment variable \ref STARPU_LIMIT_CPU_MEM can be used to
+specify how much (in MiB) of the main memory should be used at most by
+StarPU for buffer allocations. This way, StarPU will be able to
+cache buffer allocations (which can be a real benefit if a lot of buffers are
involved, or if allocation fragmentation can become a problem), and when using
\ref OutOfCore, StarPU will know when it should evict data out to the disk.
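+
+For example, to let StarPU use at most 4 GiB of main memory for its buffer
+allocations:
+
+<c>export STARPU_LIMIT_CPU_MEM=4096</c>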

@@ -233,8 +237,8 @@ caches or data out to the disk, starpu_memory_allocate() can be used to
specify an amount of memory to be accounted for. starpu_memory_deallocate()
can be used to account freed memory back. Those can for instance be used by data
interfaces with dynamic data buffers: instead of using starpu_malloc_on_node(),
-they would dynamically allocate data with malloc/realloc, and notify starpu of
-the delta thanks to starpu_memory_allocate() and starpu_memory_deallocate() calls.
+they would dynamically allocate data with \c malloc()/\c realloc(), and notify StarPU of
+the delta by calling starpu_memory_allocate() and starpu_memory_deallocate().

starpu_memory_get_total() and starpu_memory_get_available()
can be used to get an estimation of how much memory is available.
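+
+A minimal sketch of this accounting (the size and the flag are illustrative):
+
+\code{.c}
+size_t size = 64 * 1024 * 1024;
+/* Account for the allocation, waiting if the memory budget is tight */
+starpu_memory_allocate(STARPU_MAIN_RAM, size, STARPU_MEMORY_WAIT);
+void *buffer = malloc(size);
+/* ... use buffer ... */
+free(buffer);
+starpu_memory_deallocate(STARPU_MAIN_RAM, size);
+\endcode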
@@ -251,7 +255,7 @@ to reserve this amount immediately.

It is possible to reduce the memory footprint of the task and data internal
structures of StarPU by describing the shape of your machine and/or your
-application at the \c configure step.
+application when calling \c configure.

To reduce the memory footprint of the data internal structures of StarPU, one
can set the
@@ -271,28 +275,27 @@ execution. For example, in the Cholesky factorization (dense linear algebra
application), the GEMM task uses up to 3 buffers, so it is possible to set the
maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.

-The size of the various structures of StarPU can be printed by
+The size of the various structures of StarPU can be printed by
<c>tests/microbenchs/display_structures_size</c>.

-It is also often useless to submit *all* the tasks at the same time. One can
-make the starpu_task_submit() function block when a reasonable given number of
-tasks have been submitted, by setting the \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and
-\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
+It is also often useless to submit *all* the tasks at the same time.
+Task submission can be blocked when a reasonable given number of
+tasks have been submitted, by setting the environment variables
+\ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS.

<c>
export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
-
export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
</c>

-To make StarPU block submission when 10000 tasks are submitted, and unblock
+will make StarPU block submission when 10000 tasks are submitted, and unblock
submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
completed among the 10000 which were submitted when submission was blocked. Of
course this may reduce parallelism if the threshold is set too low. The precise
balance depends on the application task graph.

An idea of how much memory is used for tasks and data handles can be obtained by
-setting the \ref STARPU_MAX_MEMORY_USE environment variable to <c>1</c>.
+setting the environment variable \ref STARPU_MAX_MEMORY_USE to <c>1</c>.

\section HowtoReuseMemory How To Reuse Memory

@@ -303,7 +306,7 @@ tasks. For this system to work with MPI tasks, you need to submit tasks progress
of as soon as possible, because in the case of MPI receives, the allocation cache check for reusing data
buffers will be done at submission time, not at execution time.

-You have two options to control the task submission flow. The first one is by
+There are two options to control the task submission flow. The first one is by
controlling the number of submitted tasks during the whole execution. This can
be done either by setting the environment variables
\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
@@ -348,11 +351,12 @@ To force continuing calibration,
use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
has not-so-stable performance. StarPU will force calibration (and thus ignore
the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
+made on each architecture, to avoid bad scheduling decisions just because the
first measurements were not so good. Details on the current performance model status
-can be obtained from the tool <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
+can be obtained with the tool <c>starpu_perfmodel_display</c>: the
+option <c>-l</c> lists the available performance models, and the
+option <c>-s</c> allows choosing the performance model to be
+displayed. The result looks like:

\verbatim
$ starpu_perfmodel_display -s starpu_slu_lu_model_11
@@ -364,7 +368,7 @@ e5a07e31 4096 0.000000e+00 1.717457e+01 5.190038e+00 14
...
\endverbatim

-Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+which shows that for the LU 11 kernel with a 1MiB matrix, the average
execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
8 samples. It is a good idea to check this before doing actual performance
measurements.
@@ -373,7 +377,7 @@ A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:

\verbatim
$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304
+4096 16384 65536 262144 1048576 4194304
$ gnuplot starpu_starpu_slu_lu_model_11.gp
$ gv starpu_starpu_slu_lu_model_11.eps
\endverbatim
@@ -451,28 +455,29 @@ STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
\section OverheadProfiling Overhead Profiling

\ref OfflinePerformanceTools can already provide an idea of to what extent and
-which part of StarPU bring overhead on the execution time. To get a more precise
-analysis of the parts of StarPU which bring most overhead, <c>gprof</c> can be used.
+which parts of StarPU bring overhead to the execution time. To get a more precise
+analysis of which parts of StarPU bring the most overhead, <c>gprof</c> can be used.

First, recompile and reinstall StarPU with <c>gprof</c> support:

\code
-./configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
+../configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
\endcode

Make sure not to leave a dynamic version of StarPU in the target path: remove
any remaining <c>libstarpu-*.so</c>.

Then relink your application with the static StarPU library, and make sure that
-running <c>ldd</c> on your application does not mention any libstarpu
+running <c>ldd</c> on your application does not mention any \c libstarpu
(i.e. it's really statically-linked).

\code
gcc test.c -o test $(pkg-config --cflags starpu-1.3) $(pkg-config --libs starpu-1.3)
\endcode

-Now you can run your application, and a <c>gmon.out</c> file should appear in the
-current directory, you can process it by running <c>gprof</c> on your application:
+Now you can run your application. This will create a file
+<c>gmon.out</c> in the current directory, which can be processed by
+running <c>gprof</c> on your application:

\code
gprof ./test