390_faq.doxy 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2017 CNRS
  4. * Copyright (C) 2009-2011,2014,2016-2017 Université de Bordeaux
  5. * Copyright (C) 2011-2012 Inria
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. /*! \page FrequentlyAskedQuestions Frequently Asked Questions
  19. \section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
  20. Some libraries need to be initialized once for each concurrent instance that
  21. may run on the machine. For instance, a C++ computation class which is not
  22. * thread-safe by itself, but for which several instantiated objects of that class
  23. can be used concurrently. This can be used in StarPU by initializing one such
  24. object per worker. For instance, the <c>libstarpufft</c> example does the following to
  25. be able to use FFTW on CPUs.
  26. Some global array stores the instantiated objects:
  27. \code{.c}
  28. fftw_plan plan_cpu[STARPU_NMAXWORKERS];
  29. \endcode
  30. At initialisation time of libstarpu, the objects are initialized:
  31. \code{.c}
  32. int workerid;
  33. for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
  34. {
  35. switch (starpu_worker_get_type(workerid))
  36. {
  37. case STARPU_CPU_WORKER:
  38. plan_cpu[workerid] = fftw_plan(...);
  39. break;
  40. }
  41. }
  42. \endcode
  43. And in the codelet body, they are used:
  44. \code{.c}
  45. static void fft(void *descr[], void *_args)
  46. {
  47. int workerid = starpu_worker_get_id();
  48. fftw_plan plan = plan_cpu[workerid];
  49. ...
  50. fftw_execute(plan, ...);
  51. }
  52. \endcode
  53. This however is not sufficient for FFT on CUDA: initialization has
  54. to be done from the workers themselves. This can be done thanks to
  55. starpu_execute_on_each_worker(). For instance <c>libstarpufft</c> does the following.
  56. \code{.c}
  57. static void fft_plan_gpu(void *args)
  58. {
  59. plan plan = args;
  60. int n2 = plan->n2[0];
  61. int workerid = starpu_worker_get_id();
  62. cufftPlan1d(&plan->plans[workerid].plan_cuda, n2, CUFFT_C2C, 1);
  63. cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
  64. }
  65. void starpufft_plan(void)
  66. {
  67. starpu_execute_on_each_worker(fft_plan_gpu, plan, STARPU_CUDA);
  68. }
  69. \endcode
  70. \section UsingTheDriverAPI Using The Driver API
  71. \ref API_Running_Drivers
  72. \code{.c}
  73. int ret;
  74. struct starpu_driver d =
  75. {
  76. .type = STARPU_CUDA_WORKER,
  77. .id.cuda_id = 0
  78. };
  79. ret = starpu_driver_init(&d);
  80. if (ret != 0)
  81. error();
  82. while (some_condition)
  83. {
  84. ret = starpu_driver_run_once(&d);
  85. if (ret != 0)
  86. error();
  87. }
  88. ret = starpu_driver_deinit(&d);
  89. if (ret != 0)
  90. error();
  91. \endcode
  92. To add a new kind of device to the structure starpu_driver, one needs to:
  93. <ol>
  94. <li> Add a member to the union starpu_driver::id
  95. </li>
  96. <li> Modify the internal function <c>_starpu_launch_drivers()</c> to
  97. make sure the driver is not always launched.
  98. </li>
  99. <li> Modify the function starpu_driver_run() so that it can handle
  100. another kind of architecture.
  101. </li>
  102. <li> Write the new function <c>_starpu_run_foobar()</c> in the
  103. corresponding driver.
  104. </li>
  105. </ol>
  106. \section On-GPURendering On-GPU Rendering
  107. Graphical-oriented applications need to draw the result of their computations,
  108. typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
  109. interoperability permit to let CUDA directly work on the OpenGL buffers, making
  110. them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
  111. renderbuffer objects into CUDA. CUDA however imposes some technical
  112. constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
  113. to be the one that runs CUDA computations for that GPU.
  114. To achieve this with StarPU, pass the option
  115. \ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
  116. to <c>./configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
  117. first, and the interoperability mode has to
  118. be enabled by using the field
  119. starpu_conf::cuda_opengl_interoperability, and the driver loop has to
  120. be run by the application, by using the field
  121. starpu_conf::not_launched_drivers to prevent StarPU from running it in
  122. a separate thread, and by using starpu_driver_run() to run the loop.
  123. The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how it
  124. articulates in a simple case, where rendering is done in task
  125. callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
  126. progress from the StarPU driver loop, while the latter uses
  127. <c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
  128. Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
  129. the CUDA pointer at registration, for instance:
  130. \code{.c}
  131. /* Get the CUDA worker id */
  132. for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
  133. if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
  134. break;
  135. /* Build a CUDA pointer pointing at the OpenGL buffer */
  136. cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
  137. /* And register it to StarPU */
  138. starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
  139. output, num_bytes / sizeof(float4), sizeof(float4));
  140. /* The handle can now be used as usual */
  141. starpu_task_insert(&cl, STARPU_RW, handle, 0);
  142. /* ... */
  143. /* This gets back data into the OpenGL buffer */
  144. starpu_data_unregister(handle);
  145. \endcode
  146. and display it e.g. in the callback function.
  147. \section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
  148. Some users had issues with MKL 11 and StarPU (versions 1.1rc1 and
  149. 1.0.5) on Linux with MKL, using 1 thread for MKL and doing all the
  150. parallelism using StarPU (no multithreaded tasks), setting the
  151. environment variable <c>MKL_NUM_THREADS</c> to <c>1</c>, and using the threaded MKL library,
  152. with <c>iomp5</c>.
  153. Using this configuration, StarPU only uses 1 core, no matter the value of
  154. \ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
  155. The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
  156. (http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
  157. \section ThreadBindingOnNetBSD Thread Binding on NetBSD
  158. When using StarPU on a NetBSD machine, if the topology
  159. discovery library <c>hwloc</c> is used, thread binding will fail. To
  160. prevent the problem, you should use at least version 1.7 of
  161. <c>hwloc</c>, and also issue the following call:
  162. \verbatim
  163. $ sysctl -w security.models.extensions.user_set_cpu_affinity=1
  164. \endverbatim
  165. Or add the following line in the file <c>/etc/sysctl.conf</c>
  166. \verbatim
  167. security.models.extensions.user_set_cpu_affinity=1
  168. \endverbatim
  169. \section StarPUEatsCPUs StarPU permanently eats 100% of all CPUs
  170. Yes, this is on purpose.
  171. By default, StarPU uses active polling on task queues, so as to minimize wake-up
  172. latency for better overall performance.
  173. If eating CPU time is a problem (e.g. application running on a desktop),
  174. pass option \ref enable-blocking-drivers "--enable-blocking-drivers" to
  175. <c>./configure</c>. This will add some overhead when putting CPU workers to
  176. sleep or waking them, but avoid eating 100% CPU permanently.
  177. \section PauseResume Interleaving StarPU and non-StarPU code
  178. If your application only partially uses StarPU, and you do not want to
  179. call starpu_init() / starpu_shutdown() at the beginning/end
  180. of each section, StarPU workers will poll for work between the
  181. sections. To avoid this behavior, you can "pause" StarPU with the
  182. starpu_pause() function. This will prevent the StarPU workers from
  183. accepting new work (tasks that are already in progress will not be
  184. frozen), and stop them from polling for more work.
  185. Note that this does not prevent you from submitting new tasks, but
  186. they won't execute until starpu_resume() is called. Also note
  187. that StarPU must not be paused when you call starpu_shutdown(), and
  188. that this function pair works in a push/pull manner, i.e. you need to
  189. match the number of calls to these functions to clear their effect.
  190. One way to use these functions could be:
  191. \code{.c}
  192. starpu_init(NULL);
  193. starpu_pause(); // To submit all the tasks without a single one executing
  194. submit_some_tasks();
  195. starpu_resume(); // The tasks start executing
  196. starpu_task_wait_for_all();
  197. starpu_pause(); // Stop the workers from polling
  198. // Non-StarPU code
  199. starpu_resume();
  200. // ...
  201. starpu_shutdown();
  202. \endcode
  203. \section GPUEatingCores When running with CUDA or OpenCL devices, I am seeing less CPU cores
  204. Yes, this is on purpose.
  205. Since GPU devices are way faster than CPUs, StarPU needs to react quickly when
  206. a task is finished, to feed the GPU with another task (StarPU actually submits
  207. a couple of tasks in advance so as to pipeline this, but filling the pipeline
  208. still has to be happening often enough), and thus it has to dedicate threads for
  209. this, and this is a very CPU-consuming duty. StarPU thus dedicates one CPU core
  210. for driving each GPU by default.
  211. Such dedication is also useful when a codelet is hybrid, i.e. while kernels are
  212. running on the GPU, the codelet can run some computation, which thus be run by
  213. the CPU core instead of driving the GPU.
  214. One can choose to dedicate only one thread for all the CUDA devices by setting
  215. the STARPU_CUDA_THREAD_PER_DEV environment variable to 1. The application
  216. however should use STARPU_CUDA_ASYNC on its CUDA codelets (asynchronous
  217. execution), otherwise the execution of a synchronous CUDA codelet will
  218. monopolize the thread, and other CUDA devices will thus starve while it is
  219. executing.
  220. \section CUDADrivers StarPU does not see my CUDA device
  221. First make sure that CUDA is properly running outside StarPU: build and
  222. run the following program with -lcudart:
  223. \code{.c}
  224. #include <stdio.h>
  225. #include <cuda.h>
  226. #include <cuda_runtime.h>
  227. int main(void)
  228. {
  229. int n, i, version;
  230. cudaError_t err;
  231. err = cudaGetDeviceCount(&n);
  232. if (err)
  233. {
  234. fprintf(stderr,"cuda error %d\n", err);
  235. exit(1);
  236. }
  237. cudaDriverGetVersion(&version);
  238. printf("driver version %d\n", version);
  239. cudaRuntimeGetVersion(&version);
  240. printf("runtime version %d\n", version);
  241. printf("\n");
  242. for (i = 0; i < n; i++)
  243. {
  244. struct cudaDeviceProp props;
  245. printf("CUDA%d\n", i);
  246. err = cudaGetDeviceProperties(&props, i);
  247. if (err)
  248. {
  249. fprintf(stderr,"cuda error %d\n", err);
  250. continue;
  251. }
  252. printf("%s\n", props.name);
  253. printf("%0.3f GB\n", (float) props.totalGlobalMem / (1<<30));
  254. printf("%d MP\n", props.multiProcessorCount);
  255. printf("\n");
  256. }
  257. return 0;
  258. }
  259. \endcode
  260. If that program does not find your device, the problem is not at the StarPU
  261. level, but the CUDA drivers, check the documentation of your CUDA
  262. setup.
  263. \section OpenCLDrivers StarPU does not see my OpenCL device
  264. First make sure that OpenCL is properly running outside StarPU: build and
  265. run the following program with -lOpenCL:
  266. \code{.c}
  267. #include <CL/cl.h>
  268. #include <stdio.h>
  269. #include <assert.h>
  270. int main(void)
  271. {
  272. cl_device_id did[16];
  273. cl_int err;
  274. cl_platform_id pid, pids[16];
  275. cl_uint nbplat, nb;
  276. char buf[128];
  277. size_t size;
  278. int i, j;
  279. err = clGetPlatformIDs(sizeof(pids)/sizeof(pids[0]), pids, &nbplat);
  280. assert(err == CL_SUCCESS);
  281. printf("%u platforms\n", nbplat);
  282. for (j = 0; j < nbplat; j++)
  283. {
  284. pid = pids[j];
  285. printf(" platform %d\n", j);
  286. err = clGetPlatformInfo(pid, CL_PLATFORM_VERSION, sizeof(buf)-1, buf, &size);
  287. assert(err == CL_SUCCESS);
  288. buf[size] = 0;
  289. printf(" platform version %s\n", buf);
  290. err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, sizeof(did)/sizeof(did[0]), did, &nb);
  291. assert(err == CL_SUCCESS);
  292. printf("%u devices\n", nb);
  293. for (i = 0; i < nb; i++)
  294. {
  295. err = clGetDeviceInfo(did[i], CL_DEVICE_VERSION, sizeof(buf)-1, buf, &size);
  296. buf[size] = 0;
  297. printf(" device %d version %s\n", i, buf);
  298. }
  299. }
  300. return 0;
  301. }
  302. \endcode
  303. If that program does not find your device, the problem is not at the StarPU
  304. level, but the OpenCL drivers, check the documentation of your OpenCL
  305. implementation.
  306. \section IncorrectPerformanceModelFile I keep getting an "Incorrect performance model file" error
  307. The performance model file, used by StarPU to record the performance of
  308. codelets, seems to have been corrupted. Perhaps a previous run of StarPU stopped
  309. abruptly, and thus could not save it properly. You can have a look at the file
  310. to see if you can fix it, but the simplest way is to just remove the file and run
  311. again, StarPU will just have to re-perform calibration for the corresponding codelet.
  312. */