|| 
							- /* StarPU --- Runtime system for heterogeneous multicore architectures.
 
-  *
 
-  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 
-  *
 
-  * StarPU is free software; you can redistribute it and/or modify
 
-  * it under the terms of the GNU Lesser General Public License as published by
 
-  * the Free Software Foundation; either version 2.1 of the License, or (at
 
-  * your option) any later version.
 
-  *
 
-  * StarPU is distributed in the hope that it will be useful, but
 
-  * WITHOUT ANY WARRANTY; without even the implied warranty of
 
-  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 
-  *
 
-  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 
-  */
 
- /*! \page FrequentlyAskedQuestions Frequently Asked Questions
 
- \section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
 
- Some libraries need to be initialized once for each concurrent instance that
 
- may run on the machine. For instance, a C++ computation class which is not
 
- thread-safe by itself, but for which several instantiated objects of that class
 
- can be used concurrently. This can be used in StarPU by initializing one such
 
- object per worker. For instance, the <c>libstarpufft</c> example does the following to
 
- be able to use FFTW on CPUs.
 
- A global array stores the instantiated objects:
 
- \code{.c}
 
- fftw_plan plan_cpu[STARPU_NMAXWORKERS];
 
- \endcode
 
- At initialisation time of libstarpu, the objects are initialized:
 
- \code{.c}
 
- int workerid;
 
- for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 
- {
 
-     switch (starpu_worker_get_type(workerid))
 
-     {
 
-         case STARPU_CPU_WORKER:
 
-             plan_cpu[workerid] = fftw_plan(...);
 
-             break;
 
-     }
 
- }
 
- \endcode
 
- And in the codelet body, they are used:
 
- \code{.c}
 
- static void fft(void *descr[], void *_args)
 
- {
 
-     int workerid = starpu_worker_get_id();
 
-     fftw_plan plan = plan_cpu[workerid];
 
-     ...
 
-     fftw_execute(plan, ...);
 
- }
 
- \endcode
 
- This however is not sufficient for FFT on CUDA: initialization has
 
- to be done from the workers themselves.  This can be done thanks to
 
- starpu_execute_on_each_worker().  For instance <c>libstarpufft</c> does the following.
 
- \code{.c}
 
- static void fft_plan_gpu(void *args)
 
- {
 
-     plan plan = args;
 
-     int n2 = plan->n2[0];
 
-     int workerid = starpu_worker_get_id();
 
-     cufftPlan1d(&plan->plans[workerid].plan_cuda, n2, _CUFFT_C2C, 1);
 
-     cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
 
- }
 
- void starpufft_plan(void)
 
- {
 
-     starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
 
- }
 
- \endcode
 
- \section UsingTheDriverAPI Using The Driver API
 
- \ref API_Running_Drivers
 
- \code{.c}
 
- int ret;
 
- struct starpu_driver d =
 
- {
 
-     .type = STARPU_CUDA_WORKER,
 
-     .id.cuda_id = 0
 
- };
 
- ret = starpu_driver_init(&d);
 
- if (ret != 0)
 
-     error();
 
- while (some_condition)
 
- {
 
-     ret = starpu_driver_run_once(&d);
 
-     if (ret != 0)
 
-         error();
 
- }
 
- ret = starpu_driver_deinit(&d);
 
- if (ret != 0)
 
-     error();
 
- \endcode
 
- To add a new kind of device to the structure starpu_driver, one needs to:
 
- <ol>
 
- <li> Add a member to the union starpu_driver::id
 
- </li>
 
- <li> Modify the internal function <c>_starpu_launch_drivers()</c> to
 
- make sure the driver is not always launched.
 
- </li>
 
- <li> Modify the function starpu_driver_run() so that it can handle
 
- another kind of architecture.
 
- </li>
 
- <li> Write the new function <c>_starpu_run_foobar()</c> in the
 
- corresponding driver.
 
- </li>
 
- </ol>
 
- \section On-GPURendering On-GPU Rendering
 
- Graphical-oriented applications need to draw the result of their computations,
 
- typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 
- interoperability let CUDA work directly on the OpenGL buffers, making
 
- them immediately ready for drawing, by mapping OpenGL buffers, textures or
 
- renderbuffer objects into CUDA.  CUDA however imposes some technical
 
- constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
 
- to be the one that runs CUDA computations for that GPU.
 
- To achieve this with StarPU, pass the option
 
- \ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
 
- to <c>configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
 
- first, and the interoperability mode has to
 
- be enabled by using the field
 
- starpu_conf::cuda_opengl_interoperability, and the driver loop has to
 
- be run by the application, by using the field
 
- starpu_conf::not_launched_drivers to prevent StarPU from running it in
 
- a separate thread, and by using starpu_driver_run() to run the loop.
 
- The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how it
 
- fits together in a simple case, where rendering is done in task
 
- callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
 
- progress from the StarPU driver loop, while the latter uses
 
- <c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
 
- Then, to use an OpenGL buffer as CUDA data, StarPU simply needs to be given
 
- the CUDA pointer at registration, for instance:
 
- \code{.c}
 
- /* Get the CUDA worker id */
 
- for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 
-         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
 
-                 break;
 
- /* Build a CUDA pointer pointing at the OpenGL buffer */
 
- cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 
- /* And register it to StarPU */
 
- starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid), output, num_bytes / sizeof(float4), sizeof(float4));
 
- /* The handle can now be used as usual */
 
- starpu_task_insert(&cl, STARPU_RW, handle, 0);
 
- /* ... */
 
- /* This gets back data into the OpenGL buffer */
 
- starpu_data_unregister(handle);
 
- \endcode
 
- and display it e.g. in the callback function.
 
- \section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
 
- Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
 
- 1.0.5) on Linux with MKL, using 1 thread for MKL and doing all the
 
- parallelism using StarPU (no multithreaded tasks), setting the
 
- environment variable <c>MKL_NUM_THREADS</c> to <c>1</c>, and using the threaded MKL library,
 
- with <c>iomp5</c>.
 
- Using this configuration, StarPU only uses 1 core, no matter the value of
 
- \ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
 
- The solution is to set the environment variable <c>KMP_AFFINITY</c> to <c>disabled</c>
 
- (http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
 
- \section ThreadBindingOnNetBSD Thread Binding on NetBSD
 
- When using StarPU on a NetBSD machine, if the topology
 
- discovery library <c>hwloc</c> is used, thread binding will fail. To
 
- prevent the problem, you should use at least version 1.7 of
 
- <c>hwloc</c>, and also issue the following call:
 
- \verbatim
 
- $ sysctl -w security.models.extensions.user_set_cpu_affinity=1
 
- \endverbatim
 
- Or add the following line to the file <c>/etc/sysctl.conf</c>:
 
- \verbatim
 
- security.models.extensions.user_set_cpu_affinity=1
 
- \endverbatim
 
- \section StarPUEatsCPUs StarPU permanently eats 100% of all CPUs
 
- Yes, this is on purpose.
 
- By default, StarPU uses active polling on task queues, so as to minimize wake-up
 
- latency for better overall performance.
 
- If eating CPU time is a problem (e.g. application running on a desktop),
 
- pass option \ref enable-blocking-drivers "--enable-blocking-drivers" to
 
- <c>configure</c>. This will add some overhead when putting CPU workers to
 
- sleep or waking them, but avoid eating 100% CPU permanently.
 
- \section PauseResume Interleaving StarPU and non-StarPU code
 
- If your application only partially uses StarPU, and you do not want to
 
- call starpu_init() / starpu_shutdown() at the beginning/end
 
- of each section, StarPU workers will poll for work between the
 
- sections. To avoid this behavior, you can "pause" StarPU with the 
 
- starpu_pause() function. This will prevent the StarPU workers from
 
- accepting new work (tasks that are already in progress will not be
 
- frozen), and stop them from polling for more work.
 
- Note that this does not prevent you from submitting new tasks, but
 
- they won't execute until starpu_resume() is called. Also note
 
- that StarPU must not be paused when you call starpu_shutdown(), and
 
- that this function pair works in a push/pull manner, i.e. you need to
 
- match the number of calls to these functions to clear their effect.
 
- One way to use these functions could be:
 
- \code{.c}
 
- starpu_init(NULL);
 
- starpu_pause(); // To submit all the tasks without a single one executing
 
- submit_some_tasks();
 
- starpu_resume(); // The tasks start executing
 
- starpu_task_wait_for_all();
 
- starpu_pause(); // Stop the workers from polling
 
- // Non-StarPU code
 
- starpu_resume();
 
- // ...
 
- starpu_shutdown();
 
- \endcode
 
- \section GPUEatingCores When running with CUDA or OpenCL devices, I am seeing fewer CPU cores
 
- Yes, this is on purpose.
 
- Since GPU devices are way faster than CPUs, StarPU needs to react quickly when
 
- a task is finished, to feed the GPU with another task (StarPU actually submits
 
- a couple of tasks in advance so as to pipeline this, but filling the pipeline
 
- still has to be happening often enough), and thus it has to dedicate threads for
 
- this, and this is a very CPU-consuming duty. StarPU thus dedicates one CPU core
 
- for driving each GPU by default.
 
- Such dedication is also useful when a codelet is hybrid, i.e. while kernels are
 
- running on the GPU, the codelet can run some computation, which will thus be run by
 
- the CPU core instead of driving the GPU.
 
- One can choose to dedicate only one thread for all the CUDA devices by setting
 
- the \ref STARPU_CUDA_THREAD_PER_DEV environment variable to \c 1. The application
 
- however should use ::STARPU_CUDA_ASYNC on its CUDA codelets (asynchronous
 
- execution), otherwise the execution of a synchronous CUDA codelet will
 
- monopolize the thread, and other CUDA devices will thus starve while it is
 
- executing.
 
- \section CUDADrivers StarPU does not see my CUDA device
 
- First make sure that CUDA is properly running outside StarPU: build and
 
- run the following program with \c -lcudart :
 
- \code{.c}
 
- #include <stdio.h>
 
- #include <cuda.h>
 
- #include <cuda_runtime.h>
 
- int main(void)
 
- {
 
- 	int n, i, version;
 
- 	cudaError_t err;
 
- 	err = cudaGetDeviceCount(&n);
 
- 	if (err)
 
- 	{
 
- 		fprintf(stderr,"cuda error %d\n", err);
 
- 		return 1;
 
- 	}
 
- 	cudaDriverGetVersion(&version);
 
- 	printf("driver version %d\n", version);
 
- 	cudaRuntimeGetVersion(&version);
 
- 	printf("runtime version %d\n", version);
 
- 	printf("\n");
 
- 	for (i = 0; i < n; i++)
 
- 	{
 
- 		struct cudaDeviceProp props;
 
- 		printf("CUDA%d\n", i);
 
- 		err = cudaGetDeviceProperties(&props, i);
 
- 		if (err)
 
- 		{
 
- 			fprintf(stderr,"cuda error %d\n", err);
 
- 			continue;
 
- 		}
 
- 		printf("%s\n", props.name);
 
- 		printf("%0.3f GB\n", (float) props.totalGlobalMem / (1<<30));
 
- 		printf("%d MP\n", props.multiProcessorCount);
 
- 		printf("\n");
 
- 	}
 
- 	return 0;
 
- }
 
- \endcode
 
- If that program does not find your device, the problem is not at the StarPU
 
- level, but with the CUDA drivers; check the documentation of your CUDA
 
- setup.
 
- \section OpenCLDrivers StarPU does not see my OpenCL device
 
- First make sure that OpenCL is properly running outside StarPU: build and
 
- run the following program with \c -lOpenCL :
 
- \code{.c}
 
- #include <CL/cl.h>
 
- #include <stdio.h>
 
- #include <assert.h>
 
- int main(void)
 
- {
 
-     cl_device_id did[16];
 
-     cl_int err;
 
-     cl_platform_id pid, pids[16];
 
-     cl_uint nbplat, nb;
 
-     char buf[128];
 
-     size_t size;
 
-     int i, j;
 
-     err = clGetPlatformIDs(sizeof(pids)/sizeof(pids[0]), pids, &nbplat);
 
-     assert(err == CL_SUCCESS);
 
-     printf("%u platforms\n", nbplat);
 
-     for (j = 0; j < nbplat; j++)
 
-     {
 
-         pid = pids[j];
 
-         printf("    platform %d\n", j);
 
-         err = clGetPlatformInfo(pid, CL_PLATFORM_VERSION, sizeof(buf)-1, buf, &size);
 
-         assert(err == CL_SUCCESS);
 
-         buf[size] = 0;
 
-         printf("        platform version %s\n", buf);
 
-         err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, sizeof(did)/sizeof(did[0]), did, &nb);
 
-         assert(err == CL_SUCCESS);
 
-         printf("%u devices\n", nb);
 
-         for (i = 0; i < nb; i++)
 
- 	{
 
-             err = clGetDeviceInfo(did[i], CL_DEVICE_VERSION, sizeof(buf)-1, buf, &size);
 
-             buf[size] = 0;
 
-             printf("    device %d version %s\n", i, buf);
 
-         }
 
-     }
 
-     return 0;
 
- }
 
- \endcode
 
- If that program does not find your device, the problem is not at the StarPU
 
- level, but with the OpenCL drivers; check the documentation of your OpenCL
 
- implementation.
 
- \section IncorrectPerformanceModelFile I keep getting an "Incorrect performance model file" error
 
- The performance model file, used by StarPU to record the performance of
 
- codelets, seems to have been corrupted. Perhaps a previous run of StarPU stopped
 
- abruptly, and thus could not save it properly.  You can have a look at the file
 
- to see if you can fix it, but the simplest way is to just remove the file and run
 
- again; StarPU will just have to re-perform calibration for the corresponding codelet.
 
- */
 
 
  |