/* Find a CUDA worker, to use its memory node */
for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
                break;
/* Register the mapped OpenGL buffer with StarPU and let a task write to it */
cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                            (uintptr_t) output, num_bytes / sizeof(float4), sizeof(float4));
starpu_task_insert(&cl, STARPU_RW, handle, 0);
starpu_data_unregister(handle);
\endcode

and display it, e.g. in the callback function.
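
A minimal sketch of such a callback, assuming the task is inserted as above and that GLUT is used for the display (<c>display_callback</c> and the use of <c>glutPostRedisplay()</c> are illustrative assumptions, not part of the snippet above):

\code{.c}
/* Hypothetical completion callback: the task that filled the buffer is done,
 * so ask the display loop to redraw. */
void display_callback(void *arg)
{
        (void) arg;
        glutPostRedisplay();
}

/* ... */
starpu_task_insert(&cl, STARPU_RW, handle,
                   STARPU_CALLBACK, display_callback,
                   0);
\endcode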
\section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)

Some users have reported issues when combining StarPU (versions 1.1rc1 and
1.0.5) with MKL 11 on Linux, in the following configuration: the threaded MKL
library (linked with <c>iomp5</c>) restricted to a single thread by setting the
environment variable <c>MKL_NUM_THREADS</c> to <c>1</c>, with all the
parallelism expressed through StarPU (no multithreaded tasks).

With this configuration, StarPU only uses 1 core, no matter the value of
\ref STARPU_NCPU. The problem is actually a thread pinning issue with MKL.
The solution is to set the environment variable <c>KMP_AFFINITY</c> to <c>disabled</c>
(http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
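
For instance, assuming a hypothetical application binary named <c>my_app</c>, the workaround amounts to:

\verbatim
$ export MKL_NUM_THREADS=1
$ export KMP_AFFINITY=disabled
$ ./my_app
\endverbatim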
\section ThreadBindingOnNetBSD Thread Binding on NetBSD

When using StarPU on a NetBSD machine, if the topology discovery library
<c>hwloc</c> is used, thread binding will fail. To prevent the problem, you
should use at least version 1.7 of <c>hwloc</c>, and also issue the following call:

\verbatim
$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
\endverbatim

Or add the following line to the file <c>/etc/sysctl.conf</c>:

\verbatim
security.models.extensions.user_set_cpu_affinity=1
\endverbatim
\section PauseResume Interleaving StarPU and non-StarPU code

If your application only partially uses StarPU, and you do not want to
call starpu_init() / starpu_shutdown() at the beginning/end of each section,
StarPU workers will poll for work between the sections. To avoid this
behavior, you can "pause" StarPU with the starpu_pause() function. This will
prevent the StarPU workers from accepting new work (tasks that are already in
progress will not be frozen), and stop them from polling for more work.

Note that this does not prevent you from submitting new tasks, but they will
not execute until starpu_resume() is called. Also note that StarPU must not be
paused when you call starpu_shutdown(), and that this function pair works in a
push/pull manner, i.e. you need to match the number of calls to these functions
to clear their effect.

One way to use these functions could be:
\code{.c}
starpu_init(NULL);
starpu_pause(); // To submit all the tasks without a single one executing
submit_some_tasks();
starpu_resume(); // The tasks start executing
starpu_task_wait_for_all();
starpu_pause(); // Stop the workers from polling
// Non-StarPU code
starpu_resume();
// ...
starpu_shutdown();
\endcode
\section GPUEatingCores When running with CUDA or OpenCL devices, I am seeing fewer CPU cores

Yes, this is on purpose.

Since GPU devices are way faster than CPUs, StarPU needs to react quickly when
a task is finished, to feed the GPU with another task (StarPU actually submits
a couple of tasks in advance so as to pipeline this, but filling the pipeline
still has to happen often enough), and it thus has to dedicate threads for
this, which is a very CPU-consuming duty. StarPU therefore dedicates one CPU
core for driving each GPU by default.

Such dedication is also useful when a codelet is hybrid, i.e. while kernels are
running on the GPU, the codelet can run some computation, which can thus be run
by the CPU core instead of driving the GPU.

One can choose to dedicate only one thread for all the CUDA devices by setting
the <c>STARPU_CUDA_THREAD_PER_DEV</c> environment variable to <c>1</c>. The
application should however use <c>STARPU_CUDA_ASYNC</c> on its CUDA codelets
(asynchronous execution), otherwise the execution of a synchronous CUDA codelet
will monopolize the thread, and the other CUDA devices will starve while it is
executing.
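
A minimal sketch of a codelet whose CUDA implementation is flagged as
asynchronous. Here <c>scale_cuda</c> is a hypothetical kernel launcher, assumed
to enqueue its kernel on the stream returned by starpu_cuda_get_local_stream()
and to return without synchronizing:

\code{.c}
#include <starpu.h>

/* Hypothetical CUDA implementation: launches its kernel asynchronously on
 * starpu_cuda_get_local_stream() and returns immediately. */
extern void scale_cuda(void *buffers[], void *cl_arg);

static struct starpu_codelet scale_cl =
{
        .cuda_funcs = { scale_cuda },
        .cuda_flags = { STARPU_CUDA_ASYNC },
        .nbuffers = 1,
        .modes = { STARPU_RW },
};
\endcode

With such codelets, running the application with <c>STARPU_CUDA_THREAD_PER_DEV</c>
set to <c>1</c> in the environment lets a single thread drive all the CUDA
devices without any of them starving.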
\section CUDADrivers StarPU does not see my CUDA device

First make sure that CUDA is properly running outside StarPU: build and
run the following program with <c>-lcudart</c>:

\code{.c}
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

int main(void)
{
        int n, i, version;
        cudaError_t err;

        err = cudaGetDeviceCount(&n);
        if (err)
        {
                fprintf(stderr, "cuda error %d\n", err);
                exit(1);
        }
        cudaDriverGetVersion(&version);
        printf("driver version %d\n", version);
        cudaRuntimeGetVersion(&version);
        printf("runtime version %d\n", version);
        printf("\n");
        for (i = 0; i < n; i++)
        {
                struct cudaDeviceProp props;
                printf("CUDA%d\n", i);
                err = cudaGetDeviceProperties(&props, i);
                if (err)
                {
                        fprintf(stderr, "cuda error %d\n", err);
                        continue;
                }
                printf("%s\n", props.name);
                printf("%0.3f GB\n", (float) props.totalGlobalMem / (1 << 30));
                printf("%d MP\n", props.multiProcessorCount);
                printf("\n");
        }
        return 0;
}
\endcode
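
One possible way to build and run it, assuming the source is saved as
<c>test_cuda.c</c> and the CUDA runtime headers and library are in the default
search paths (otherwise add the appropriate <c>-I</c> and <c>-L</c> flags
pointing to your CUDA installation):

\verbatim
$ gcc test_cuda.c -o test_cuda -lcudart
$ ./test_cuda
\endverbatim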
If this program does not find your device, the problem is not at the StarPU
level but with the CUDA drivers; check the documentation of your CUDA setup.
\section OpenCLDrivers StarPU does not see my OpenCL device

First make sure that OpenCL is properly running outside StarPU: build and
run the following program with <c>-lOpenCL</c>:

\code{.c}
#include <stdio.h>
#include <assert.h>
#include <CL/cl.h>

int main(void)
{
        cl_device_id did[16];
        cl_int err;
        cl_platform_id pid, pids[16];
        cl_uint nbplat, nb;
        char buf[128];
        size_t size;
        unsigned i, j;

        err = clGetPlatformIDs(sizeof(pids)/sizeof(pids[0]), pids, &nbplat);
        assert(err == CL_SUCCESS);
        printf("%u platforms\n", nbplat);
        for (j = 0; j < nbplat; j++)
        {
                pid = pids[j];
                printf("  platform %u\n", j);
                err = clGetPlatformInfo(pid, CL_PLATFORM_VERSION, sizeof(buf)-1, buf, &size);
                assert(err == CL_SUCCESS);
                buf[size] = 0;
                printf("  platform version %s\n", buf);

                err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, sizeof(did)/sizeof(did[0]), did, &nb);
                assert(err == CL_SUCCESS);
                printf("%u devices\n", nb);
                for (i = 0; i < nb; i++)
                {
                        err = clGetDeviceInfo(did[i], CL_DEVICE_VERSION, sizeof(buf)-1, buf, &size);
                        assert(err == CL_SUCCESS);
                        buf[size] = 0;
                        printf("    device %u version %s\n", i, buf);
                }
        }
        return 0;
}
\endcode
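
As above, a possible way to build and run it, assuming the source is saved as
<c>test_opencl.c</c> and the OpenCL headers and library are in the default
search paths:

\verbatim
$ gcc test_opencl.c -o test_opencl -lOpenCL
$ ./test_opencl
\endverbatim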
If this program does not find your device, the problem is not at the StarPU
level but with the OpenCL drivers; check the documentation of your OpenCL
implementation.
\section IncorrectPerformanceModelFile I keep getting an "Incorrect performance model file" error

The performance model file, used by StarPU to record the performance of
codelets, seems to have been corrupted. Perhaps a previous run of StarPU
stopped abruptly, and thus could not save it properly. You can have a look at
the file to see whether you can fix it, but the simplest way is to just remove
the file and run again: StarPU will simply re-perform the calibration for the
corresponding codelet.
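
For instance, assuming the default location of the performance model files
(under <c>$HOME/.starpu/sampling</c>) and a codelet model named
<c>my_codelet</c> (a placeholder for the name reported in the error message),
removing the offending files could look like:

\verbatim
$ rm $HOME/.starpu/sampling/codelets/*/my_codelet*
\endverbatim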
*/