преди 4 години · 5f4c7f1bb2
--- a/doc/doxygen/chapters/310_data_management.doxy
+++ b/doc/doxygen/chapters/310_data_management.doxy
@@ -1124,7 +1124,7 @@ according to the interface, passing it the pointers, and checking whether it
 
				 returned \c -EAGAIN, which means the copy is asynchronous, and StarPU will
			
 
				 appropriately wait for it thanks to the pointer \c async_data.
			
 
				 
			
 
				-This copy method is referenced in a structure \ref starpu_data_copy_methods:
			
 
				+This copy method is referenced in a structure \ref starpu_data_copy_methods
			
 
				 
			
 
				 \code{.c}
			
 
				 static const struct starpu_data_copy_methods complex_copy_methods =
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -23,146 +23,8 @@ the following environment variables.
 
				 
			
 
				 \section EnvConfiguringWorkers Configuring Workers
			
 
				 
			
 
				+\subsection Basic General Configuration
			
 
				 <dl>
			
 
				-
			
 
				-<dt>STARPU_NCPU</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NCPU
			
 
				-\addindex __env__STARPU_NCPU
			
 
				-Specify the number of CPU workers (thus not including workers
			
 
				-dedicated to control accelerators). Note that by default, StarPU will
			
 
				-not allocate more CPU workers than there are physical CPUs, and that
			
 
				-some CPUs are used to control the accelerators.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_RESERVE_NCPU</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_RESERVE_NCPU
			
 
				-\addindex __env__STARPU_RESERVE_NCPU
			
 
				-Specify the number of CPU cores that should not be used by StarPU, so the
			
 
				-application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
			
 
				-its own threads.
			
 
				-
			
 
				-This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NCPUS</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NCPUS
			
 
				-\addindex __env__STARPU_NCPUS
			
 
				-This variable is deprecated. You should use \ref STARPU_NCPU.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NCUDA</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NCUDA
			
 
				-\addindex __env__STARPU_NCUDA
			
 
				-Specify the number of CUDA devices that StarPU can use. If
			
 
				-\ref STARPU_NCUDA is lower than the number of physical devices, it is
			
 
				-possible to select which CUDA devices should be used by the means of the
			
 
				-environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
			
 
				-create as many CUDA workers as there are CUDA devices.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NWORKER_PER_CUDA</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NWORKER_PER_CUDA
			
 
				-\addindex __env__STARPU_NWORKER_PER_CUDA
			
 
				-Specify the number of workers per CUDA device, and thus the number of kernels
			
 
				-which will be concurrently running on the devices, i.e. the number of CUDA
			
 
				-streams. The default value is 1.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_CUDA_THREAD_PER_WORKER
			
 
				-\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
			
 
				-Specify whether the cuda driver should use one thread per stream (1) or to use
			
 
				-a single thread to drive all the streams of the device or all devices (0), and
			
 
				-\ref STARPU_CUDA_THREAD_PER_DEV determines whether is it one thread per device or one
			
 
				-thread for all devices. The default value is 0. Setting it to 1 is contradictory
			
 
				-with setting \ref STARPU_CUDA_THREAD_PER_DEV.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_CUDA_THREAD_PER_DEV
			
 
				-\addindex __env__STARPU_CUDA_THREAD_PER_DEV
			
 
				-Specify whether the cuda driver should use one thread per device (1) or to use a
			
 
				-single thread to drive all the devices (0). The default value is 1.  It does not
			
 
				-make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to to 1
			
 
				-(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_CUDA_PIPELINE</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_CUDA_PIPELINE
			
 
				-\addindex __env__STARPU_CUDA_PIPELINE
			
 
				-Specify how many asynchronous tasks are submitted in advance on CUDA
			
 
				-devices. This for instance permits to overlap task management with the execution
			
 
				-of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				-execution of all tasks.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NOPENCL</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NOPENCL
			
 
				-\addindex __env__STARPU_NOPENCL
			
 
				-OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_OPENCL_PIPELINE</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_OPENCL_PIPELINE
			
 
				-\addindex __env__STARPU_OPENCL_PIPELINE
			
 
				-Specify how many asynchronous tasks are submitted in advance on OpenCL
			
 
				-devices. This for instance permits to overlap task management with the execution
			
 
				-of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				-execution of all tasks.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_OPENCL_ON_CPUS</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_OPENCL_ON_CPUS
			
 
				-\addindex __env__STARPU_OPENCL_ON_CPUS
			
 
				-By default, the OpenCL driver only enables GPU and accelerator
			
 
				-devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
			
 
				-to 1, the OpenCL driver will also enable CPU devices.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_OPENCL_ONLY_ON_CPUS
			
 
				-\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
			
 
				-By default, the OpenCL driver enables GPU and accelerator
			
 
				-devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
			
 
				-to 1, the OpenCL driver will ONLY enable CPU devices.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NMPI_MS</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NMPI_MS
			
 
				-\addindex __env__STARPU_NMPI_MS
			
 
				-MPI Master Slave equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
			
 
				-MPI Master Slave devices to use.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_NMPIMSTHREADS</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_NMPIMSTHREADS
			
 
				-\addindex __env__STARPU_NMPIMSTHREADS
			
 
				-Number of threads to use on the MPI Slave devices.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_MPI_MASTER_NODE</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_MPI_MASTER_NODE
			
 
				-\addindex __env__STARPU_MPI_MASTER_NODE
			
 
				-This variable allows to chose which MPI node (with the MPI ID) will be the master.
			
 
				-</dd>
			
 
				-
			
 
				 <dt>STARPU_WORKERS_NOBIND</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_WORKERS_NOBIND
			
@@ -247,58 +109,6 @@ Same as \ref STARPU_MAIN_THREAD_CPUID, but bind the thread that calls
 
				 starpu_initialize() to the given core, instead of the PU (hyperthread).
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_MPI_THREAD_CPUID</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_MPI_THREAD_CPUID
			
 
				-\addindex __env__STARPU_MPI_THREAD_CPUID
			
 
				-When defined, this make StarPU bind its MPI thread to the given CPU ID. Setting
			
 
				-it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
			
 
				-workers.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_MPI_THREAD_COREID</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_MPI_THREAD_COREID
			
 
				-\addindex __env__STARPU_MPI_THREAD_COREID
			
 
				-Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
			
 
				-ID, instead of the PU (hyperthread).
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_MPI_NOBIND</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_MPI_NOBIND
			
 
				-\addindex __env__STARPU_MPI_NOBIND
			
 
				-Setting it to non-zero will prevent StarPU from binding the MPI to
			
 
				-a separate core. This is for instance useful when running the testsuite on a single system.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_WORKERS_CUDAID</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_WORKERS_CUDAID
			
 
				-\addindex __env__STARPU_WORKERS_CUDAID
			
 
				-Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
			
 
				-possible to select which CUDA devices should be used by StarPU. On a machine
			
 
				-equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
			
 
				-<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
			
 
				-they should use CUDA devices #1 and #3 (the logical ordering of the devices is
			
 
				-the one reported by CUDA).
			
 
				-
			
 
				-This variable is ignored if the field
			
 
				-starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
			
 
				-is set.
			
 
				-</dd>
			
 
				-
			
 
				-<dt>STARPU_WORKERS_OPENCLID</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_WORKERS_OPENCLID
			
 
				-\addindex __env__STARPU_WORKERS_OPENCLID
			
 
				-OpenCL equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
			
 
				-
			
 
				-This variable is ignored if the field
			
 
				-starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
			
 
				-is set.
			
 
				-</dd>
			
 
				-
			
 
				 <dt>STARPU_WORKER_TREE</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_WORKER_TREE
			
@@ -320,25 +130,23 @@ and \ref STARPU_MAX_WORKERSIZE can be used to change this default.
 
				 <dd>
			
 
				 \anchor STARPU_MIN_WORKERSIZE
			
 
				 \addindex __env__STARPU_MIN_WORKERSIZE
			
 
				-\ref STARPU_MIN_WORKERSIZE
			
 
				-permits to specify the minimum size of the combined workers (instead of the default 2)
			
 
				+Specify the minimum size of the combined workers. Default value is 2.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_MAX_WORKERSIZE</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_MAX_WORKERSIZE
			
 
				 \addindex __env__STARPU_MAX_WORKERSIZE
			
 
				-\ref STARPU_MAX_WORKERSIZE
			
 
				-permits to specify the minimum size of the combined workers (instead of the
			
 
				-number of CPU workers in the system)
			
 
				+Specify the maximum size of the combined workers. Default value is the
			
 
				+number of CPU workers in the system.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
			
 
				 \addindex __env__STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
			
 
				-Let the user decide how many elements are allowed between combined workers
			
 
				-created from hwloc information. For instance, in the case of sockets with 6
			
 
				+Specify how many elements are allowed between combined workers
			
 
				+created from \c hwloc information. For instance, in the case of sockets with 6
			
 
				 cores without shared L2 caches, if \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER is
			
 
				 set to 6, no combined worker will be synthesized beyond one for the socket
			
 
				 and one per core. If it is set to 3, 3 intermediate combined workers will be
			
@@ -361,6 +169,135 @@ Disable asynchronous copies between CPU and GPU devices.
 
				 The AMD implementation of OpenCL is known to
			
 
				 fail when copying data asynchronously. When using this implementation,
			
 
				 it is therefore necessary to disable asynchronous data transfers.
			
 
				+
			
 
				+See also \ref STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY and \ref
			
 
				+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_DISABLE_PINNING</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_DISABLE_PINNING
			
 
				+\addindex __env__STARPU_DISABLE_PINNING
			
 
				+Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc(), starpu_memory_pin()
			
 
				+and friends.  The default is Enabled.
			
 
				+This makes it possible to test the performance effect of memory pinning.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_BACKOFF_MIN</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_BACKOFF_MIN
			
 
				+\addindex __env__STARPU_BACKOFF_MIN
			
 
				+Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_BACKOFF_MAX</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_BACKOFF_MAX
			
 
				+\addindex __env__STARPU_BACKOFF_MAX
			
 
				+Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
			
 
				+</dd>
			
 
				+</dl>
			
 
				+
			
 
				+\subsection cpuWorkers CPU Workers
			
 
				+<dl>
			
 
				+<dt>STARPU_NCPU</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NCPU
			
 
				+\addindex __env__STARPU_NCPU
			
 
				+Specify the number of CPU workers (thus not including workers
			
 
				+dedicated to control accelerators). Note that by default, StarPU will
			
 
				+not allocate more CPU workers than there are physical CPUs, and that
			
 
				+some CPUs are used to control the accelerators.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_RESERVE_NCPU</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_RESERVE_NCPU
			
 
				+\addindex __env__STARPU_RESERVE_NCPU
			
 
				+Specify the number of CPU cores that should not be used by StarPU, so the
			
 
				+application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
			
 
				+its own threads.
			
 
				+
			
 
				+This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_NCPUS</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NCPUS
			
 
				+\addindex __env__STARPU_NCPUS
			
 
				+This variable is deprecated. You should use \ref STARPU_NCPU.
			
 
				+</dd>
			
 
				+
			
 
				+</dl>
			
 
				+
			
 
				+\subsection cudaWorkers CUDA Workers
			
 
				+<dl>
			
 
				+<dt>STARPU_NCUDA</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NCUDA
			
 
				+\addindex __env__STARPU_NCUDA
			
 
				+Specify the number of CUDA devices that StarPU can use. If
			
 
				+\ref STARPU_NCUDA is lower than the number of physical devices, it is
			
 
				+possible to select which GPU devices should be used by the means of the
			
 
				+environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
			
 
				+create as many CUDA workers as there are GPU devices.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_NWORKER_PER_CUDA</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NWORKER_PER_CUDA
			
 
				+\addindex __env__STARPU_NWORKER_PER_CUDA
			
 
				+Specify the number of workers per CUDA device, and thus the number of kernels
			
 
				+which will be concurrently running on the devices, i.e. the number of CUDA
			
 
				+streams. The default value is 1.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_CUDA_THREAD_PER_WORKER
			
 
				+\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
			
 
				+Specify whether the CUDA driver should use one thread per stream (1) or use
			
 
				+a single thread to drive all the streams of the device or all devices (0), and
			
 
				+\ref STARPU_CUDA_THREAD_PER_DEV determines whether it is one thread per device or one
			
 
				+thread for all devices. The default value is 0. Setting it to 1 is contradictory
			
 
				+with setting \ref STARPU_CUDA_THREAD_PER_DEV.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_CUDA_THREAD_PER_DEV
			
 
				+\addindex __env__STARPU_CUDA_THREAD_PER_DEV
			
 
				+Specify whether the CUDA driver should use one thread per device (1) or use a
			
 
				+single thread to drive all the devices (0). The default value is 1.  It does not
			
 
				+make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to 1
			
 
				+(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_CUDA_PIPELINE</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_CUDA_PIPELINE
			
 
				+\addindex __env__STARPU_CUDA_PIPELINE
			
 
				+Specify how many asynchronous tasks are submitted in advance on CUDA
			
 
				+devices. This for instance permits to overlap task management with the execution
			
 
				+of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				+execution of all tasks.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_WORKERS_CUDAID</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_WORKERS_CUDAID
			
 
				+\addindex __env__STARPU_WORKERS_CUDAID
			
 
				+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
			
 
				+possible to select which CUDA devices should be used by StarPU. On a machine
			
 
				+equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
			
 
				+<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
			
 
				+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
			
 
				+the one reported by CUDA).
			
 
				+
			
 
				+This variable is ignored if the field
			
 
				+starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
			
 
				+is set.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY</dt>
			
@@ -368,6 +305,90 @@ it is therefore necessary to disable asynchronous data transfers.
 
				 \anchor STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
			
 
				 \addindex __env__STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
			
 
				 Disable asynchronous copies between CPU and CUDA devices.
			
 
				+
			
 
				+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
			
 
				+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
			
 
				+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
			
 
				+Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
			
 
				+through RAM. The default is Enabled.
			
 
				+This makes it possible to test the performance effect of GPU-Direct.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				+Specify if CUDA workers should do only fast allocations
			
 
				+when running the datawizard progress of
			
 
				+other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
			
 
				+Default value is 0, allowing CUDA workers to do slow allocations.
			
 
				+</dd>
			
 
				+
			
 
				+</dl>
			
 
				+
			
 
				+\subsection openclWorkers OpenCL Workers
			
 
				+<dl>
			
 
				+<dt>STARPU_NOPENCL</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_NOPENCL
			
 
				+\addindex __env__STARPU_NOPENCL
			
 
				+Specify the number of OpenCL devices that StarPU can use. If
			
 
				+\ref STARPU_NOPENCL is lower than the number of physical devices, it is
			
 
				+possible to select which GPU devices should be used by the means of the
			
 
				+environment variable \ref STARPU_WORKERS_OPENCLID. By default, StarPU will
			
 
				+create as many OpenCL workers as there are GPU devices.
			
 
				+
			
 
				+Note that by default StarPU will launch CUDA workers on GPU devices.
			
 
				+You need to disable CUDA to allow the creation of OpenCL workers.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_WORKERS_OPENCLID</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_WORKERS_OPENCLID
			
 
				+\addindex __env__STARPU_WORKERS_OPENCLID
			
 
				+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
			
 
				+possible to select which GPU devices should be used by StarPU. On a machine
			
 
				+equipped with 4 GPUs, setting <c>STARPU_WORKERS_OPENCLID = "1 3"</c> and
			
 
				+<c>STARPU_NOPENCL=2</c> specifies that 2 OpenCL workers should be
			
 
				+created, and that they should use GPU devices #1 and #3.
			
 
				+
			
 
				+This variable is ignored if the field
			
 
				+starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
			
 
				+is set.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_OPENCL_PIPELINE</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_OPENCL_PIPELINE
			
 
				+\addindex __env__STARPU_OPENCL_PIPELINE
			
 
				+Specify how many asynchronous tasks are submitted in advance on OpenCL
			
 
				+devices. This for instance permits to overlap task management with the execution
			
 
				+of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				+execution of all tasks.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_OPENCL_ON_CPUS</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_OPENCL_ON_CPUS
			
 
				+\addindex __env__STARPU_OPENCL_ON_CPUS
			
 
				+By default, the OpenCL driver only enables GPU and accelerator
			
 
				+devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
			
 
				+to 1, the OpenCL driver will also enable CPU devices.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_OPENCL_ONLY_ON_CPUS
			
 
				+\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
			
 
				+By default, the OpenCL driver enables GPU and accelerator
			
 
				+devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
			
 
				+to 1, the OpenCL driver will ONLY enable CPU devices.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY</dt>
			
@@ -378,59 +399,76 @@ Disable asynchronous copies between CPU and OpenCL devices.
 
				 The AMD implementation of OpenCL is known to
			
 
				 fail when copying data asynchronously. When using this implementation,
			
 
				 it is therefore necessary to disable asynchronous data transfers.
			
 
				+
			
 
				+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
			
 
				+STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY.
			
 
				 </dd>
			
 
				+</dl>
			
 
				 
			
 
				-<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
			
 
				+
			
 
				+\subsection mpimsWorkers MPI Master Slave Workers
			
 
				+<dl>
			
 
				+<dt>STARPU_NMPI_MS</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
			
 
				-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
			
 
				-Disable asynchronous copies between CPU and MPI Slave devices.
			
 
				+\anchor STARPU_NMPI_MS
			
 
				+\addindex __env__STARPU_NMPI_MS
			
 
				+Specify the number of MPI master slave devices that StarPU can use.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
			
 
				+<dt>STARPU_NMPIMSTHREADS</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
			
 
				-\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
			
 
				-Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
			
 
				-through RAM. The default is Enabled.
			
 
				-This permits to test the performance effect of GPU-Direct.
			
 
				+\anchor STARPU_NMPIMSTHREADS
			
 
				+\addindex __env__STARPU_NMPIMSTHREADS
			
 
				+Number of threads to use on the MPI Slave devices.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_DISABLE_PINNING</dt>
			
 
				+<dt>STARPU_MPI_MASTER_NODE</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_DISABLE_PINNING
			
 
				-\addindex __env__STARPU_DISABLE_PINNING
			
 
				-Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc, starpu_memory_pin
			
 
				-and friends.  The default is Enabled.
			
 
				-This permits to test the performance effect of memory pinning.
			
 
				+\anchor STARPU_MPI_MASTER_NODE
			
 
				+\addindex __env__STARPU_MPI_MASTER_NODE
			
 
				+This variable allows choosing which MPI node (with the MPI ID) will be the master.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_BACKOFF_MIN</dt>
			
 
				+<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_BACKOFF_MIN
			
 
				-\addindex __env__STARPU_BACKOFF_MIN
			
 
				-Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
			
 
				+\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
			
 
				+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
			
 
				+Disable asynchronous copies between CPU and MPI Slave devices.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_BACKOFF_MAX</dt>
			
 
				+</dl>
			
 
				+
			
 
				+\subsection mpiConf MPI Configuration
			
 
				+<dl>
			
 
				+
			
 
				+<dt>STARPU_MPI_THREAD_CPUID</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_BACKOFF_MAX
			
 
				-\addindex __env__STARPU_BACKOFF_MAX
			
 
				-Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
			
 
				+\anchor STARPU_MPI_THREAD_CPUID
			
 
				+\addindex __env__STARPU_MPI_THREAD_CPUID
			
 
				+When defined, this makes StarPU bind its MPI thread to the given CPU ID. Setting
			
 
				+it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
			
 
				+workers.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
			
 
				+<dt>STARPU_MPI_THREAD_COREID</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				-\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
			
 
				-Specify if CUDA workers should do only fast allocations
			
 
				-when running the datawizard progress of
			
 
				-other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
			
 
				-Default value is 0, allowing CUDA workers to do slow allocations.
			
 
				+\anchor STARPU_MPI_THREAD_COREID
			
 
				+\addindex __env__STARPU_MPI_THREAD_COREID
			
 
				+Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
			
 
				+ID, instead of the PU (hyperthread).
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_MPI_NOBIND</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_MPI_NOBIND
			
 
				+\addindex __env__STARPU_MPI_NOBIND
			
 
				+Setting it to non-zero will prevent StarPU from binding the MPI thread to
			
 
				+a separate core. This is for instance useful when running the testsuite on a single system.
			
 
				 </dd>
			
 
				 
			
 
				 </dl>
			
 
				 
			
 
				+
			
 
				 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
			
 
				 
			
 
				 <dl>
			
@@ -791,13 +829,6 @@ and allows studying scheduling overhead of the runtime system. However,
 
				 it also makes simulation non-deterministic.
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_SINK</dt>
			
 
				-<dd>
			
 
				-\anchor STARPU_SINK
			
 
				-\addindex __env__STARPU_SINK
			
 
				-Variable defined by StarPU when running MPI Xeon PHI on the sink.
			
 
				-</dd>
			
 
				-
			
 
				 </dl>
			
 
				 
			
 
				 \section MiscellaneousAndDebug Miscellaneous And Debug